DataDistribution.h
Go to the documentation of this file.
1 //===========================================================================
2 /*!
3  * \brief Learning problems given by analytic distributions.
4  *
5  *
6  * \author T. Glasmachers
7  * \date 2006-2013
8  *
9  *
10  * <BR><HR>
11  * This file is part of Shark. This library is free software;
12  * you can redistribute it and/or modify it under the terms of the
13  * GNU General Public License as published by the Free Software
14  * Foundation; either version 3, or (at your option) any later version.
15  *
16  * This library is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  * GNU General Public License for more details.
20  *
21  * You should have received a copy of the GNU General Public License
22  * along with this library; if not, see <http://www.gnu.org/licenses/>.
23  *
24  */
25 //===========================================================================
26 
27 
28 #ifndef SHARK_DATA_DATADISTRIBUTION_H
29 #define SHARK_DATA_DATADISTRIBUTION_H
30 
31 #include <shark/Data/Dataset.h>
32 #include <shark/Rng/GlobalRng.h>
33 #include <utility>
34 
35 namespace shark {
36 
37 
38 ///
39 /// \brief A DataDistribution defines an unsupervised learning problem.
40 ///
41 /// \par
42 /// The unsupervised learning problem is defined by an explicit
43 /// distribution (in contrast to a finite dataset). The only
44 /// method we need is to draw a sample from the distribution.
45 ///
46 template <class InputType>
48 {
49 public:
50  /// \brief Virtual destructor.
51  virtual ~DataDistribution() { }
52 
53  /// \brief Generates a single input sample.
54  ///
55  /// @param input the generated input
56  virtual void draw(InputType& input) const = 0;
57 
58  // \brief Interface for std::generate.
59  InputType operator() () {
60  InputType ret;
61  draw(ret);
62  return ret;
63  }
64 
65  /// \brief Generates a data set with samples from from the distribution.
66  ///
67  /// @param size the number of samples in the dataset
68  /// @param maximumBatchSize the maximum size of a batch
69  UnlabeledData<InputType> generateDataset(std::size_t size,std::size_t maximumBatchSize) const {
70  std::size_t batches = (size + maximumBatchSize - 1) / maximumBatchSize;
71  std::size_t optimalBatchSize = size / batches;
72  std::size_t remainder = size - batches * optimalBatchSize;
73  UnlabeledData<InputType> dataset(batches);
74  InputType input;
75 
76  // now create and fill the batches, taking the remainder into account
77  for (std::size_t i=0; i<batches; ++i)
78  {
79  std::size_t batchsize = (i<remainder) ? optimalBatchSize + 1 : optimalBatchSize;
80  typename UnlabeledData<InputType>::batch_reference b = dataset.batch(i);
81  draw(input);
82  b = Batch<InputType>::createBatch(input, batchsize);
83  for (std::size_t j=0; j<batchsize; j++)
84  {
85  if (j != 0) draw(input);
86  shark::get(b, j) = input;
87  }
88  }
89  return dataset;
90  }
91 
92  /// \brief Generates a data set with samples from the distribution.
93  ///
94  /// @param size the number of samples in the dataset
97  }
98 };
99 
100 
101 ///
102 /// \brief A LabeledDataDistribution defines a supervised learning problem.
103 ///
104 /// \par
105 /// The supervised learning problem is defined by an explicit
106 /// distribution (in contrast to a finite dataset). The only
107 /// method we need is to draw a sample from the distribution.
108 ///
109 template <class InputType, class LabelType>
111 {
112 public:
113  /// \brief Virtual destructor.
115 
116  /// \brief Generates a single pair of input and label.
117  /// @param input the generated input
118  /// @param label the generated label
119  virtual void draw(InputType& input, LabelType& label) const = 0;
120 
121  // \Brief Interface for std::generate.
122  std::pair<InputType,LabelType> operator() () {
123  std::pair<InputType,LabelType> ret;
124  draw(ret.first,ret.second);
125  return ret;
126  }
127 
128  /// \brief Generates a dataset with samples from the distribution.
129  ///
 /// The samples are split into ceil(size / maximumBatchSize) batches; the
 /// first "remainder" batches receive one element more than the others.
 ///
 /// NOTE(review): size == 0 yields batches == 0, and "size / batches" then
 /// divides by zero; maximumBatchSize == 0 divides by zero as well. Both
 /// arguments therefore have to be positive - confirm whether callers
 /// guarantee this.
 ///
130  /// @param size the number of samples in the dataset
131  /// @param maximumBatchSize the maximum size of a batch
132  LabeledData<InputType, LabelType> generateDataset(std::size_t size,std::size_t maximumBatchSize) const
133  {
134  // first determine the optimal number of batches and their sizes
135  std::size_t batches = (size + maximumBatchSize - 1) / maximumBatchSize;
136  std::size_t optimalBatchSize = size / batches;
137  std::size_t remainder = size - batches * optimalBatchSize;
138  LabeledData<InputType, LabelType> dataset(batches);
139  InputType input;
140  LabelType label;
141  DataPair<InputType, LabelType> pair(input, label);
142 
143  // now create and fill the batches, taking the remainder into account
144  for (std::size_t i=0; i<batches; ++i)
145  {
 // the first "remainder" batches hold one extra element
146  std::size_t batchsize = (i<remainder) ? optimalBatchSize + 1 : optimalBatchSize;
 // NOTE(review): "b", used in the inner loop, is not declared in the
 // visible lines. The embedded numbering jumps 146 -> 148 -> 150, so the
 // declaration of b and the batch creation were presumably lost when
 // this listing was extracted - restore them from the original header.
148  draw(input, label); pair.input = input; pair.label = label;
150  for (std::size_t j=0; j<batchsize; j++)
151  {
 // element 0 reuses the sample drawn before this loop
152  if (j != 0) draw(input, label);
153  shark::get(b, j).input = input;
154  shark::get(b, j).label = label;
155  }
156  }
157  return dataset;
158  }
159 
160  /// \brief Generates a data set with samples from the distribution.
161  ///
162  /// @param size the number of samples in the dataset
165  }
166 };
167 
168 
169 ///
170 /// \brief "chess board" problem for binary classification
171 ///
172 class Chessboard : public LabeledDataDistribution<RealVector, unsigned int>
173 {
174 public:
175  Chessboard(unsigned int size = 4, double noiselevel = 0.0)
176  {
177  m_size = size;
178  m_noiselevel = noiselevel;
179  }
180 
181 
182  void draw(RealVector& input, unsigned int& label)const{
183  input.resize(2);
184  unsigned int j, t = 0;
185  for (j = 0; j < 2; j++)
186  {
187  double v = Rng::uni(0.0, (double)m_size);
188  t += (int)floor(v);
189  input(j) = v;
190  }
191  label = (t & 1);
192  if (Rng::uni(0.0, 1.0) < m_noiselevel) label = 1 - label;
193  }
194 
195 protected:
196  unsigned int m_size;
197  double m_noiselevel;
198 };
199 
200 
201 ///
202 /// \brief Noisy sinc function: y = sin(x) / x + noise
203 ///
204 class Wave : public LabeledDataDistribution<RealVector, RealVector>
205 {
206 public:
207  Wave(double stddev = 0.1, double range = 5.0){
208  m_stddev = stddev;
209  m_range = range;
210  }
211 
212 
213  void draw(RealVector& input, RealVector& label)const{
214  input.resize(1);
215  label.resize(1);
216  input(0) = Rng::uni(-m_range, m_range);
217  if(input(0) != 0)
218  label(0) = sin(input(0)) / input(0) + Rng::gauss(0.0, m_stddev);
219  else
220  label(0) = Rng::gauss(0.0, m_stddev);
221  }
222 
223 protected:
224  double m_stddev;
225  double m_range;
226 };
227 
228 
229 
230 /// "Pami Toy" problem for binary classification, as used in the article "Glasmachers
231 /// and C. Igel. Maximum Likelihood Model Selection for 1-Norm Soft Margin SVMs with Multiple
232 /// Parameters. IEEE Transactions on Pattern Analysis and Machine Intelligence, 2010."
233 /// In summary, the first M dimensions are correlated to the labels, the last N dimensions
234 /// are not.
235 class PamiToy : public LabeledDataDistribution<RealVector, unsigned int>
236 {
237 public:
238  PamiToy(unsigned int size_useful = 5, unsigned int size_noise = 5, double noise_position = 0.0, double noise_variance = 1.0 )
239  : m_size( size_useful+size_noise ),
240  m_sizeUseful( size_useful ),
241  m_sizeNoise( size_noise ),
242  m_noisePos( noise_position) ,
243  m_noiseVar( noise_variance )
244  { }
245 
246  void draw(RealVector& input, unsigned int& label)const{
247  input.resize( m_size );
248  label = Rng::discrete( 0, 1 ); //fix label first
249  double y2 = label - 0.5; //"clean" informative feature values
250  // now fill the informative features..
251  for ( unsigned int i=0; i<m_sizeUseful; i++ ) {
252  input(i) = y2 + Rng::gauss( m_noisePos, m_noiseVar );
253  }
254  // ..and the uninformative ones
255  for ( unsigned int i=m_sizeUseful; i<m_size; i++ ) {
256  input(i) = Rng::gauss( m_noisePos, m_noiseVar );
257  }
258  }
259 
260 protected:
261  unsigned int m_size;
262  unsigned int m_sizeUseful;
263  unsigned int m_sizeNoise;
264  double m_noisePos;
265  double m_noiseVar;
266 };
267 
268 /// This class randomly fills a (hyper-)square with data points. Points which
269 /// happen to be within a (hyper-)circle centered in the square of a certain
270 /// radius get a positive class label. Noise on the labels can be added.
271 class CircleInSquare : public LabeledDataDistribution<RealVector, unsigned int>
272 {
273 public:
274  CircleInSquare( unsigned int dimensions = 2, double noiselevel = 0.0, bool class_prob_equal = false )
275  : m_dimensions( dimensions ),
276  m_noiselevel( noiselevel ),
277  m_lowerLimit( -1 ),
278  m_upperLimit( 1 ),
279  m_centerpoint( 0 ),
280  m_inner_radius2( 0.5*0.5 ),
281  m_outer_radius2( 0.5*0.5 ),
282  m_equal_class_prob( class_prob_equal )
283  { }
284 
285  /// allow for arbitrary box limits
286  void setLimits( double lower_limit, double upper_limit, double inner_radius, double outer_radius )
287  {
288  RANGE_CHECK( lower_limit < upper_limit );
289  RANGE_CHECK( inner_radius <= outer_radius );
290  RANGE_CHECK( 2*outer_radius <= upper_limit-lower_limit );
291  m_lowerLimit = lower_limit;
292  m_upperLimit = upper_limit;
293  m_centerpoint = (upper_limit-lower_limit)/2.0;
294  m_inner_radius2 = inner_radius*inner_radius;
295  m_outer_radius2 = outer_radius*outer_radius;
296  }
297 
298  void draw(RealVector& input, unsigned int& label)
299  {
300  input.resize( m_dimensions );
301  double v, dist;
302 
303  if ( m_equal_class_prob ) { //each class has equal probability - this implementation is brute-force and gorgeously inefficient :/
304  bool this_label = Rng::coinToss();
305  label = ( this_label ? 1 : 0 );
306  if ( Rng::uni(0.0, 1.0) < m_noiselevel )
307  label = 1 - label;
308  if ( this_label ) {
309  do {
310  dist = 0.0;
311  for ( unsigned int i=0; i<m_dimensions; i++ ) {
312  v = Rng::uni( m_lowerLimit, m_upperLimit );
313  input(i) = v;
314  dist += (v-m_centerpoint)*(v-m_centerpoint);
315  }
316  } while( dist > m_inner_radius2 );
317  }
318  else {
319  do {
320  dist = 0.0;
321  for ( unsigned int i=0; i<m_dimensions; i++ ) {
322  v = Rng::uni( m_lowerLimit, m_upperLimit );
323  input(i) = v;
324  dist += (v-m_centerpoint)*(v-m_centerpoint);
325  }
326  } while( dist < m_outer_radius2 );
327  }
328  }
329  else { //equal probability to be anywhere in the cube
330  do {
331  dist = 0.0;
332  for ( unsigned int i=0; i<m_dimensions; i++ ) {
333  v = Rng::uni( m_lowerLimit, m_upperLimit );
334  input(i) = v;
335  dist += (v-m_centerpoint)*(v-m_centerpoint);
336  }
337  label = ( dist < m_inner_radius2 ? 1 : 0 );
338  if ( Rng::uni(0.0, 1.0) < m_noiselevel )
339  label = 1 - label;
340  } while( dist > m_inner_radius2 && dist < m_outer_radius2 );
341  }
342  }
343 
344 protected:
345  unsigned int m_dimensions;
346  double m_noiselevel;
347  double m_lowerLimit;
348  double m_upperLimit;
352  bool m_equal_class_prob; ///<if true, the probability to belong to either class is equal. if false, it is uniform over the cube.
353 };
354 
355 // This class randomly fills a 4x4 square in the 2D-plane with data points.
356 // Points in the lower left diagonal half (x+y < 4) are positive, points in
357 // the upper right diagonal half are negative. Additionally, all points in a
358 // circle centered at (3,1) in the lower right quadrant are positive, which
359 // bulges the decision boundary into the upper right half. Noise on the labels can be added.
360 class DiagonalWithCircle : public LabeledDataDistribution<RealVector, unsigned int>
361 {
362 public:
363  DiagonalWithCircle( double radius = 1.0, double noise = 0.0 )
364  : m_radius2( radius*radius ),
366  { }
367 
368  void draw(RealVector& input, unsigned int& label)
369  {
370  input.resize( 2 );
371  double x,y;
372  x = Rng::uni( 0, 4 ); //zero is left
373  y = Rng::uni( 0, 4 ); //zero is bottom
374  // assign label according to position w.r.t. the diagonal
375  if ( x+y < 4 )
376  label = 1;
377  else
378  label = 0;
379  // but if in the circle (even above diagonal), assign positive label
380  if ( (3-x)*(3-x) + (1-y)*(1-y) < m_radius2 )
381  label = 1;
382 
383  // add noise
384  if ( Rng::uni(0.0, 1.0) < m_noiselevel )
385  label = 1 - label;
386  input(0) = x;
387  input(1) = y;
388  }
389 
390 protected:
391  double m_radius2;
392  double m_noiselevel;
393 };
394 
395 }
396 #endif