DataDistribution.h
Go to the documentation of this file.
1 //===========================================================================
2 /*!
3  *
4  *
5  * \brief Learning problems given by analytic distributions.
6  *
7  *
8  *
9  *
10  * \author T. Glasmachers
11  * \date 2006-2013
12  *
13  *
14  * \par Copyright 1995-2017 Shark Development Team
15  *
16  * <BR><HR>
17  * This file is part of Shark.
18  * <http://shark-ml.org/>
19  *
20  * Shark is free software: you can redistribute it and/or modify
21  * it under the terms of the GNU Lesser General Public License as published
22  * by the Free Software Foundation, either version 3 of the License, or
23  * (at your option) any later version.
24  *
25  * Shark is distributed in the hope that it will be useful,
26  * but WITHOUT ANY WARRANTY; without even the implied warranty of
27  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28  * GNU Lesser General Public License for more details.
29  *
30  * You should have received a copy of the GNU Lesser General Public License
31  * along with Shark. If not, see <http://www.gnu.org/licenses/>.
32  *
33  */
34 //===========================================================================
35 
36 
37 #ifndef SHARK_DATA_DATADISTRIBUTION_H
38 #define SHARK_DATA_DATADISTRIBUTION_H
39 
40 #include <shark/Data/Dataset.h>
41 #include <shark/Core/Random.h>
43 #include <utility>
44 
45 namespace shark {
46 
47 
48 ///
49 /// \brief A DataDistribution defines an unsupervised learning problem.
50 ///
51 /// \par
52 /// The unsupervised learning problem is defined by an explicit
53 /// distribution (in contrast to a finite dataset). The only
54 /// method we need is to draw a sample from the distribution.
55 ///
56 template <class InputType>
58 {
59 public:
60  /// \brief Virtual destructor.
61  virtual ~DataDistribution() { }
62 
	/// \brief Generates a single input sample.
64  ///
65  /// @param input the generated input
66  virtual void draw(InputType& input) const = 0;
67 
	/// \brief Interface for std::generate.
70  InputType ret;
71  draw(ret);
72  return ret;
73  }
74 
75  /// \brief Generates a data set with samples from from the distribution.
76  ///
77  /// @param size the number of samples in the dataset
78  /// @param maximumBatchSize the maximum size of a batch
79  UnlabeledData<InputType> generateDataset(std::size_t size,std::size_t maximumBatchSize) const {
80  std::vector<InputType> data(size);
81 
82  // draw the samples
83  for (std::size_t i = 0; i < size; ++i){
84  draw(data[i]);
85  }
86  //create dataset
87  return createUnlabeledDataFromRange(data,maximumBatchSize);
88  }
89 
	/// \brief Generates a data set with samples from the distribution.
91  ///
92  /// @param size the number of samples in the dataset
93  UnlabeledData<InputType> generateDataset(std::size_t size) const {
95  }
96 };
97 
98 
99 ///
100 /// \brief A LabeledDataDistribution defines a supervised learning problem.
101 ///
102 /// \par
103 /// The supervised learning problem is defined by an explicit
104 /// distribution (in contrast to a finite dataset). The only
105 /// method we need is to draw a sample from the distribution.
106 ///
107 template <class InputType, class LabelType>
109 {
110 public:
111  /// \brief Virtual destructor.
113 
114  /// \brief Generates a single pair of input and label.
115  /// @param input the generated input
116  /// @param label the generated label
117  virtual void draw(InputType& input, LabelType& label) const = 0;
118 
119  // \Brief Interface for std::generate.
120  std::pair<InputType,LabelType> operator() () {
121  std::pair<InputType,LabelType> ret;
122  draw(ret.first,ret.second);
123  return ret;
124  }
125 
126  /// \brief Generates a dataset with samples from from the distribution.
127  ///
128  /// @param size the number of samples in the dataset
129  /// @param maximumBatchSize the maximum size of a batch
130  LabeledData<InputType, LabelType> generateDataset(std::size_t size,std::size_t maximumBatchSize) const{
131  std::vector<InputType> inputs(size);
132  std::vector<LabelType> labels(size);
133 
134  // draw the samples
135  for (std::size_t i = 0; i < size; ++i){
136  draw(inputs[i], labels[i]);
137  }
138  //create dataset
139  return createLabeledDataFromRange(inputs,labels,maximumBatchSize);
140  }
141 
	/// \brief Generates a data set with samples from the distribution.
143  ///
144  /// @param size the number of samples in the dataset
147  }
148 };
149 
150 
151 ///
152 /// \brief "chess board" problem for binary classification
153 ///
154 class Chessboard : public LabeledDataDistribution<RealVector, unsigned int>
155 {
156 public:
157  Chessboard(unsigned int size = 4, double noiselevel = 0.0)
158  {
159  m_size = size;
160  m_noiselevel = noiselevel;
161  }
162 
163 
164  void draw(RealVector& input, unsigned int& label)const{
165  input.resize(2);
166  unsigned int j, t = 0;
167  for (j = 0; j < 2; j++)
168  {
169  double v = random::uni(random::globalRng, 0.0, (double)m_size);
170  t += (int)floor(v);
171  input(j) = v;
172  }
173  label = (t & 1);
174  if (random::uni(random::globalRng, 0.0, 1.0) < m_noiselevel) label = 1 - label;
175  }
176 
177 protected:
178  unsigned int m_size;
179  double m_noiselevel;
180 };
181 
182 
183 ///
184 /// \brief Noisy sinc function: y = sin(x) / x + noise
185 ///
186 class Wave : public LabeledDataDistribution<RealVector, RealVector>
187 {
188 public:
189  Wave(double stddev = 0.1, double range = 5.0){
190  m_stddev = stddev;
191  m_range = range;
192  }
193 
194 
195  void draw(RealVector& input, RealVector& label)const{
196  input.resize(1);
197  label.resize(1);
198  input(0) = random::uni(random::globalRng, -m_range, m_range);
199  if(input(0) != 0)
200  label(0) = sin(input(0)) / input(0) + random::gauss(random::globalRng, 0.0, m_stddev);
201  else
202  label(0) = random::gauss(random::globalRng, 0.0, m_stddev);
203  }
204 
205 protected:
206  double m_stddev;
207  double m_range;
208 };
209 
210 
211 
212 /// "Pami Toy" problem for binary classification, as used in the article "Glasmachers
213 /// and C. Igel. Maximum Likelihood Model Selection for 1-Norm Soft Margin SVMs with Multiple
214 /// Parameters. IEEE Transactions on Pattern Analysis and Machine Intelligence, 2010."
215 /// In summary, the first M dimensions are correlated to the labels, the last N dimensions
216 /// are not.
217 class PamiToy : public LabeledDataDistribution<RealVector, unsigned int>
218 {
219 public:
220  PamiToy(unsigned int size_useful = 5, unsigned int size_noise = 5, double noise_position = 0.0, double noise_variance = 1.0 )
221  : m_size( size_useful+size_noise ),
222  m_sizeUseful( size_useful ),
223  m_sizeNoise( size_noise ),
224  m_noisePos( noise_position) ,
225  m_noiseVar( noise_variance )
226  { }
227 
228  void draw(RealVector& input, unsigned int& label)const{
229  input.resize( m_size );
230  label = (unsigned int) random::discrete(random::globalRng, 0,1); //fix label first
231  double y2 = label - 0.5; //"clean" informative feature values
232  // now fill the informative features..
233  for ( unsigned int i=0; i<m_sizeUseful; i++ ) {
234  input(i) = y2 + random::gauss(random::globalRng, m_noisePos, m_noiseVar );
235  }
236  // ..and the uninformative ones
237  for ( unsigned int i=m_sizeUseful; i<m_size; i++ ) {
238  input(i) = random::gauss(random::globalRng, m_noisePos, m_noiseVar );
239  }
240  }
241 
242 protected:
243  unsigned int m_size;
244  unsigned int m_sizeUseful;
245  unsigned int m_sizeNoise;
246  double m_noisePos;
247  double m_noiseVar;
248 };
249 
250 /// This class randomly fills a (hyper-)square with data points. Points which
251 /// happen to be within a (hyper-)circle centered in the square of a certain
252 /// radius get a positive class label. Noise on the labels can be added.
253 class CircleInSquare : public LabeledDataDistribution<RealVector, unsigned int>
254 {
255 public:
256  CircleInSquare( unsigned int dimensions = 2, double noiselevel = 0.0, bool class_prob_equal = false )
257  : m_dimensions( dimensions ),
258  m_noiselevel( noiselevel ),
259  m_lowerLimit( -1 ),
260  m_upperLimit( 1 ),
261  m_centerpoint( 0 ),
262  m_inner_radius2( 0.5*0.5 ),
263  m_outer_radius2( 0.5*0.5 ),
264  m_equal_class_prob( class_prob_equal )
265  { }
266 
267  /// allow for arbitrary box limits
268  void setLimits( double lower_limit, double upper_limit, double inner_radius, double outer_radius )
269  {
270  RANGE_CHECK( lower_limit < upper_limit );
271  RANGE_CHECK( inner_radius <= outer_radius );
272  RANGE_CHECK( 2*outer_radius <= upper_limit-lower_limit );
273  m_lowerLimit = lower_limit;
274  m_upperLimit = upper_limit;
275  m_centerpoint = (upper_limit-lower_limit)/2.0;
276  m_inner_radius2 = inner_radius*inner_radius;
277  m_outer_radius2 = outer_radius*outer_radius;
278  }
279 
280  void draw(RealVector& input, unsigned int& label)const
281  {
282  input.resize( m_dimensions );
283  double v, dist;
284 
285  if ( m_equal_class_prob ) { //each class has equal probability - this implementation is brute-force and gorgeously inefficient :/
286  bool this_label = random::coinToss(random::globalRng);
287  label = ( this_label ? 1 : 0 );
288  if ( random::uni(random::globalRng, 0.0, 1.0) < m_noiselevel )
289  label = 1 - label;
290  if ( this_label ) {
291  do {
292  dist = 0.0;
293  for ( unsigned int i=0; i<m_dimensions; i++ ) {
294  v = random::uni(random::globalRng, m_lowerLimit, m_upperLimit );
295  input(i) = v;
296  dist += (v-m_centerpoint)*(v-m_centerpoint);
297  }
298  } while( dist > m_inner_radius2 );
299  }
300  else {
301  do {
302  dist = 0.0;
303  for ( unsigned int i=0; i<m_dimensions; i++ ) {
304  v = random::uni(random::globalRng, m_lowerLimit, m_upperLimit );
305  input(i) = v;
306  dist += (v-m_centerpoint)*(v-m_centerpoint);
307  }
308  } while( dist < m_outer_radius2 );
309  }
310  }
311  else { //equal probability to be anywhere in the cube
312  do {
313  dist = 0.0;
314  for ( unsigned int i=0; i<m_dimensions; i++ ) {
315  v = random::uni(random::globalRng, m_lowerLimit, m_upperLimit );
316  input(i) = v;
317  dist += (v-m_centerpoint)*(v-m_centerpoint);
318  }
319  label = ( dist < m_inner_radius2 ? 1 : 0 );
320  if ( random::uni(random::globalRng, 0.0, 1.0) < m_noiselevel )
321  label = 1 - label;
322  } while( dist > m_inner_radius2 && dist < m_outer_radius2 );
323  }
324  }
325 
326 protected:
327  unsigned int m_dimensions;
328  double m_noiselevel;
329  double m_lowerLimit;
330  double m_upperLimit;
334  bool m_equal_class_prob; ///<if true, the probability to belong to either class is equal. if false, it is uniform over the cube.
335 };
336 
337 // This class randomly fills a 4x4 square in the 2D-plane with data points.
338 // Points in the lower left diagonal half are negative, points in the
339 // upper right diagonal half are positive. But additionally, all points
340 // in a circle located in the lower right quadrant are positive, effectively
341 // bulging the decision boundary inward. Noise on the labels can be added.
342 class DiagonalWithCircle : public LabeledDataDistribution<RealVector, unsigned int>
343 {
344 public:
345  DiagonalWithCircle( double radius = 1.0, double noise = 0.0 )
346  : m_radius2( radius*radius ),
347  m_noiselevel( noise )
348  { }
349 
350  void draw(RealVector& input, unsigned int& label)const
351  {
352  input.resize( 2 );
353  double x,y;
354  x = random::uni(random::globalRng, 0, 4 ); //zero is left
355  y = random::uni(random::globalRng, 0, 4 ); //zero is bottom
356  // assign label according to position w.r.t. the diagonal
357  if ( x+y < 4 )
358  label = 1;
359  else
360  label = 0;
361  // but if in the circle (even above diagonal), assign positive label
362  if ( (3-x)*(3-x) + (1-y)*(1-y) < m_radius2 )
363  label = 1;
364 
365  // add noise
366  if ( random::uni(random::globalRng, 0.0, 1.0) < m_noiselevel )
367  label = 1 - label;
368  input(0) = x;
369  input(1) = y;
370  }
371 
372 protected:
373  double m_radius2;
374  double m_noiselevel;
375 };
376 
377 
378 /// \brief Generates a set of normally distributed points
380 {
381 public:
382  /// \brief Generates a simple distribution with
383  NormalDistributedPoints(std::size_t dim): m_offset(dim,0){
384  RealMatrix covariance(dim,dim,0);
385  diag(covariance) = blas::repeat(1.0,dim);
386  m_dist.setCovarianceMatrix(covariance);
387  }
388  NormalDistributedPoints(RealMatrix const& covariance, RealVector const& offset)
389  :m_dist(covariance), m_offset(offset){
390  SIZE_CHECK(offset.size() == covariance.size1());
391  }
392  void draw(RealVector& input) const{
393  input.resize(m_offset.size());
394  noalias(input) = m_offset;
395  noalias(input) += m_dist(random::globalRng).first;
396  }
397 private:
399  RealVector m_offset;
400 };
401 
402 /// \brief Given a set of images, draws a set of image patches of a given size
403 class ImagePatches:public DataDistribution<RealVector>{
404 public:
406  Data<RealVector> images,
407  std::size_t imageWidth, std::size_t imageHeight,
408  std::size_t patchWidth, std::size_t patchHeight
409  ):m_images(images)
410  , m_imageWidth(imageWidth)
411  , m_imageHeight(imageHeight)
412  , m_patchWidth(patchWidth)
413  , m_patchHeight(patchHeight)
414  ,m_numImages(m_images.numberOfElements()){}
415 
416  void draw(RealVector& input) const{
417  //sample image
418  std::size_t imageNum = random::discrete(random::globalRng, std::size_t(0),m_numImages-1);
419  Data<RealVector>::const_element_reference image = m_images.element(imageNum);
420  //draw the upper left corner of the image
421  std::size_t m_startX = random::discrete(random::globalRng, std::size_t(0),m_imageWidth-m_patchWidth);
422  std::size_t m_startY = random::discrete(random::globalRng, std::size_t(0),m_imageHeight-m_patchHeight);
423 
424 
425  //copy patch
426  input.resize(m_patchWidth * m_patchHeight);
427  std::size_t rowStart = m_startY * m_imageWidth + m_startX;
428  for (size_t y = 0; y < m_patchHeight; ++y){
429  for (size_t x = 0; x < m_patchWidth; ++x){
430  input(y * m_patchWidth + x) = image(rowStart+x);
431  }
432  rowStart += m_imageWidth;
433  }
434  }
435 private:
436  Data<RealVector> m_images;
437  std::size_t m_imageWidth;
438  std::size_t m_imageHeight;
439  std::size_t m_patchWidth;
440  std::size_t m_patchHeight;
441  std::size_t m_numImages;
442 };
443 
444 }
445 #endif