CVDatasetTools.h
Go to the documentation of this file.
1 //===========================================================================
2 /*!
3  *
4  *
5  * \brief Tools for cross-validation
6  *
7  *
8  *
9  * \author O.Krause
10  * \date 2010-2012
11  *
12  *
13  * \par Copyright 1995-2017 Shark Development Team
14  *
15  * <BR><HR>
16  * This file is part of Shark.
17  * <http://shark-ml.org/>
18  *
19  * Shark is free software: you can redistribute it and/or modify
20  * it under the terms of the GNU Lesser General Public License as published
21  * by the Free Software Foundation, either version 3 of the License, or
22  * (at your option) any later version.
23  *
24  * Shark is distributed in the hope that it will be useful,
25  * but WITHOUT ANY WARRANTY; without even the implied warranty of
26  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27  * GNU Lesser General Public License for more details.
28  *
29  * You should have received a copy of the GNU Lesser General Public License
30  * along with Shark. If not, see <http://www.gnu.org/licenses/>.
31  *
32  */
33 //===========================================================================
34 
35 #ifndef SHARK_DATA_CVDATASETTOOLS_H
36 #define SHARK_DATA_CVDATASETTOOLS_H
37 
38 #include <shark/Data/Dataset.h>
39 #include <shark/Core/Random.h>
40 #include <algorithm>
41 #include <shark/Data/DataView.h>
42 
43 #include <utility> //for std::pair
44 
45 namespace shark {
46 
47 
48 template<class DatasetTypeT>
49 class CVFolds {
50 public:
51  typedef DatasetTypeT DatasetType;
52  typedef typename DatasetType::IndexSet IndexSet;
53 
54  /// \brief Creates an empty set of folds.
55  CVFolds() {}
56  ///\brief partitions set in validation folds indicated by the second argument.
57  ///
58  /// The folds are given as the batch indices of the validation sets
60  DatasetType const &set,
61  std::vector<IndexSet> const &validationIndizes
62  ) : m_dataset(set),m_validationFolds(validationIndizes) {}
63 
65  DatasetType const &set,
66  std::vector<std::size_t> const &foldStart
67  ) : m_dataset(set){
68  for (std::size_t partition = 0; partition != foldStart.size(); partition++) {
69  std::size_t partitionSize = (partition+1 == foldStart.size()) ? set.numberOfBatches() : foldStart[partition+1];
70  partitionSize -= foldStart[partition];
71  //create the set with the indices of the validation set of the current partition
72  //also update the starting element
73  IndexSet validationIndizes(partitionSize);
74  for (std::size_t batch = 0; batch != partitionSize; ++batch) {
75  validationIndizes[batch]=batch+foldStart[partition];
76  }
77  m_validationFolds.push_back(validationIndizes);
78  }
79  }
80 
81  DatasetType training(std::size_t i) const {
82  SIZE_CHECK(i < size());
83  return m_dataset.indexedSubset(trainingFoldIndices(i));
84  }
85  DatasetType validation(std::size_t i) const {
86  SIZE_CHECK(i < size());
87  return m_dataset.indexedSubset(validationFoldIndices(i));
88  }
89 
90  ///\brief returns the indices that make up the i-th validation fold
91  IndexSet const &validationFoldIndices(std::size_t i)const {
92  SIZE_CHECK(i < size());
93  return m_validationFolds[i];
94  }
95 
96  IndexSet trainingFoldIndices(std::size_t i)const {
97  SIZE_CHECK(i < size());
98  IndexSet trainingFold;
99  detail::complement(m_validationFolds[i], m_dataset.numberOfBatches(), trainingFold);
100  return trainingFold;
101  }
102 
103  ///\brief Returns the number of folds of the dataset.
104  std::size_t size()const {
105  return m_validationFolds.size();
106  }
107 
108  /// \brief Returns the dataset underying the folds
109  DatasetType const& dataset()const{
110  return m_dataset;
111  }
112 
113  /// \brief Returns the dataset underying the folds
114  DatasetType& dataset(){
115  return m_dataset;
116  }
117 
118 private:
119  DatasetType m_dataset;
120  std::vector<IndexSet> m_validationFolds;
121  std::size_t m_datasetSize;
122  std::vector<std::size_t> m_validationFoldSizes;
123 };
124 
125 
126 /// auxiliary typedef for createCVSameSizeBalanced and createCVFullyIndexed, stores location index in the first and partition index in the second
127 typedef std::pair< std::vector<std::size_t> , std::vector<std::size_t> > RecreationIndices;
128 
129 namespace detail {
130 
131 ///\brief Version of createCVSameSizeBalanced which works regardless of the label type
132 ///
133 /// Instead of a class label to interpret, this class uses a membership vector for every
134 /// class which members[k][i] returns the positon of the i-th member of class k in the set.
135 template<class I, class L>
137  LabeledData<I,L> &set,
138  std::size_t numberOfPartitions,
139  std::vector< std::vector<std::size_t> > members,
140  std::size_t batchSize,
141  RecreationIndices * cv_indices = NULL //if not NULL: the first vector stores location information, and
142  // the second the partition information. The i-th value of the
143  // first vector shows what the original position of the now i-th
144  // sample was. The i-th value of the second vector shows what
145  // partition that sample now belongs to.
146 ) {
147  std::size_t numInputs = set.numberOfElements();
148  std::size_t numClasses = members.size();
149 
150  //shuffle elements in members
151  for (std::size_t c = 0; c != numClasses; c++) {
152  std::shuffle(members[c].begin(), members[c].end(), random::globalRng);
153  }
154 
155  //calculate number of elements per validation subset in the new to construct container
156  std::size_t nn = numInputs / numberOfPartitions;
157  std::size_t leftOver = numInputs % numberOfPartitions;
158  std::vector<std::size_t> validationSize(numberOfPartitions,nn);
159  for (std::size_t partition = 0; partition != leftOver; partition++) {
160  validationSize[partition]++;
161  }
162 
163  //calculate the size of the batches for every validation part
164  std::vector<std::size_t> partitionStart;
165  std::vector<std::size_t> batchSizes;
166  std::size_t numBatches = batchPartitioning(validationSize,partitionStart,batchSizes,batchSize);
167 
168 
169  LabeledData<I,L> newSet(numBatches);//set of empty batches
170  DataView<LabeledData<I,L> > setView(set);//fast access to single elements of the original set
171  std::vector<std::size_t> validationSetStart = partitionStart;//current index for the batch of every fold
172  //partition classes into the validation subsets of newSet
173  std::size_t fold = 0;//current fold
174  std::vector<std::vector<std::size_t> > batchElements(numberOfPartitions);
175 
176  //initialize the list of position indices which can later be used to re-create the fold (via createCV(Fully)Indexed)
177  if ( cv_indices != NULL ) {
178  cv_indices->first.clear();
179  cv_indices->first.resize( numInputs );
180  cv_indices->second.clear();
181  cv_indices->second.resize( numInputs );
182  }
183 
184  size_t j = 0; //for recreation indices
185  for (std::size_t c = 0; c != numClasses; c++) {
186  for (std::size_t i = 0; i != members[c].size(); i++) {
187  std::size_t oldPos = members[c][i];
188  std::size_t batchNumber = validationSetStart[fold];
189 
190  batchElements[fold].push_back(oldPos);
191 
192  if ( cv_indices != NULL ) {
193  cv_indices->first[ j ] = oldPos; //store the position in which the (now) i-th sample previously resided
194  cv_indices->second[ j ] = fold; //store the partition to which the (now) i-th sample gets assigned
195  // old: //(*cv_indices)[ oldPos ] = fold; //store in vector to recreate partition if desired
196  }
197 
198  //if all elements for the current batch are found, create it
199  if (batchElements[fold].size() == batchSizes[batchNumber]) {
200  newSet.batch(validationSetStart[fold]) = subBatch(setView,batchElements[fold]);
201  batchElements[fold].clear();
202  ++validationSetStart[fold];
203  }
204 
205  fold = (fold+1) % numberOfPartitions;
206 
207  j++;
208  }
209  }
210  SHARK_ASSERT( j == numInputs );
211 
212  //swap old and new set
213  swap(set, newSet);
214 
215  //create folds
216  return CVFolds<LabeledData<I,L> >(set,partitionStart);
217 
218 }
219 }//namespace detail
220 
221 /**
222  * \ingroup shark_globals
223  *
224  * @{
225  */
226 
227 //! \brief Create a partition for cross validation
228 //!
229 //! The subset each training examples belongs to
230 //! is drawn independently and uniformly distributed.
231 //! For every partition, all but one subset form the
232 //! training set, while the remaining one is used for
233 //! validation. The partitions can be accessed using
234 //! getCVPartitionName
235 //!
236 //! \param set the input data for which the new partitions are created
237 //! \param numberOfPartitions number of partitions to create
238 //! \param batchSize maximum batch size
239 template<class I,class L>
241  std::size_t numberOfPartitions,
242  std::size_t batchSize=Data<I>::DefaultBatchSize) {
243  std::vector<std::size_t> indices(set.numberOfElements());
244  for (std::size_t i=0; i != set.numberOfElements(); i++)
245  indices[i] = random::discrete(random::globalRng, std::size_t(0), numberOfPartitions - 1);
246  return createCVIndexed(set,numberOfPartitions,indices,batchSize);
247 }
248 
249 //! \brief Create a partition for cross validation
250 //!
251 //! Every subset contains (approximately) the same
252 //! number of elements. For every partition, all
253 //! but one subset form the training set, while the
254 //! remaining one is used for validation. The partitions
255 //! can be accessed using getCVPartitionName
256 //!
257 //! \param numberOfPartitions number of partitions to create
258 //! \param set the input data from which to draw the partitions
259 //! \param batchSize maximum batch size
260 template<class I,class L>
262  std::size_t numInputs = set.numberOfElements();
263 
264  //calculate the number of validation examples for every partition
265  std::vector<std::size_t> validationSize(numberOfPartitions);
266  std::size_t inputsForValidation = numInputs / numberOfPartitions;
267  std::size_t leftOver = numInputs - inputsForValidation * numberOfPartitions;
268  for (std::size_t i = 0; i != numberOfPartitions; i++) {
269  std::size_t vs=inputsForValidation+(i<leftOver);
270  validationSize[i] =vs;
271  }
272 
273  //calculate the size of batches for every validation part and their total number
274  std::vector<std::size_t> partitionStart;
275  std::vector<std::size_t> batchSizes;
276  detail::batchPartitioning(validationSize,partitionStart,batchSizes,batchSize);
277 
278  set.repartition(batchSizes);
279  set.shuffle();
280 
281  CVFolds<LabeledData<I,L> > folds(set,partitionStart);
282  return folds;//set;
283 }
284 
285 
286 //! \brief Create a partition for cross validation
287 //!
288 //! Every subset contains (approximately) the same
289 //! number of elements. For every partition, all
290 //! but one subset form the training set, while the
291 //! remaining one is used for validation.
292 //!
293 //! \param numberOfPartitions number of partitions to create
294 //! \param set the input data from which to draw the partitions
295 //! \param batchSize maximum batch size
296 //! \param cv_indices if not NULL [default]: for each element, store the fold it is assigned to; this can be used to later/externally recreate the fold via createCVIndexed
297 template<class I>
300  std::size_t numberOfPartitions,
302  RecreationIndices * cv_indices = NULL //if not NULL: for each element, store the fold it is assigned to; this can be used to later/externally recreate the fold via createCVIndexed
303 ){
305  std::size_t numInputs = setView.size();
306  std::size_t numClasses = numberOfClasses(set);
307 
308 
309  //find members of each class
310  std::vector< std::vector<std::size_t> > members(numClasses);
311  for (std::size_t i = 0; i != numInputs; i++) {
312  members[setView[i].label].push_back(i);
313  }
314  return detail::createCVSameSizeBalanced(set, numberOfPartitions, members, batchSize, cv_indices);
315 
316 }
317 
318 //! \brief Create a partition for cross validation without changing the dataset
319 //!
320 //! This method behaves similar to createCVIID
321 //! with the difference that batches are not reordered. Thus the batches
322 //! are only rearranged randomly in folds, but the dataset itself is not changed.
323 //!
324 //! \param numberOfPartitions number of partitions to create
325 //! \param set the input data from which to draw the partitions
326 template<class I, class L>
328  LabeledData<I,L> const& set,
329  std::size_t numberOfPartitions
330 ){
331  std::vector<std::size_t> indizes(set.numberOfBatches());
332  for(std::size_t i= 0; i != set.numberOfBatches(); ++i)
333  indizes[i] = i;
334  shark::shuffle(indizes.begin(),indizes.end(), random::globalRng);
335 
336  typedef typename LabeledData<I,L>::IndexSet IndexSet;
337 
338  std::vector<IndexSet> folds;
339  std::size_t partitionSize = set.numberOfBatches()/numberOfPartitions;
340  std::size_t remainder = set.numberOfBatches() - partitionSize*numberOfPartitions;
341  std::vector<std::size_t>::iterator pos = indizes.begin();
342  for(std::size_t i = 0; i!= numberOfPartitions; ++i){
343  std::size_t size = partitionSize;
344  if(remainder> 0){
345  ++size;
346  --remainder;
347  }
348  folds.push_back(IndexSet(pos,pos+size));
349  pos+=size;
350  }
351  return CVFolds<LabeledData<I,L> >(set,folds);
352 }
353 
354 //! \brief Create a partition for cross validation from indices
355 //!
356 //! Create a partition from indices. The indices vector for each sample states of what
357 //! validation partition that sample should become a member. In other words, the index
358 //! maps a sample to a validation partition, meaning that it will become a part of the
359 //! training partition for all other folds.
360 //!
361 //! \param set partitions will be subsets of this set
362 //! \param numberOfPartitions number of partitions to create
363 //! \param indices partition indices of the examples in [0, ..., numberOfPartitions[.
364 //! \param batchSize maximum batch size
365 template<class I,class L>
367  LabeledData<I,L> &set,
368  std::size_t numberOfPartitions,
369  std::vector<std::size_t> indices,
371 ) {
372  std::size_t numInputs = set.numberOfElements();
373  SIZE_CHECK(indices.size() == numInputs);
374  SIZE_CHECK(numberOfPartitions == *std::max_element(indices.begin(),indices.end())+1);
375 
376  //calculate the size of validation partitions
377  std::vector<std::size_t> validationSize(numberOfPartitions,0);
378  for (std::size_t input = 0; input != numInputs; input++) {
379  validationSize[indices[input]]++;
380  }
381 
382  //calculate the size of batches for every validation part and their total number
383  std::vector<std::size_t> partitionStart;
384  std::vector<std::size_t> batchSizes;
385  std::size_t numBatches = detail::batchPartitioning(validationSize,partitionStart,batchSizes,batchSize);
386 
387  //construct a new set with the correct batch format from the old set
388  LabeledData<I,L> newSet(numBatches);
389  DataView<LabeledData<I,L> > setView(set); //fast access to single elements of the original set
390  std::vector<std::size_t> validationSetStart = partitionStart; //current index for the batch of every partition
391  std::vector<std::vector<std::size_t> > batchElements(numberOfPartitions);
392  for (std::size_t input = 0; input != numInputs; input++) {
393  std::size_t partition = indices[input];
394  batchElements[partition].push_back(input);
395 
396  //if all elements for the current batch are found, create it
397  std::size_t batchNumber = validationSetStart[partition];
398  if (batchElements[partition].size() == batchSizes[batchNumber]) {
399  newSet.batch(validationSetStart[partition]) = subBatch(setView,batchElements[partition]);
400  batchElements[partition].clear();
401  ++validationSetStart[partition];
402  }
403  }
404  swap(set, newSet);
405  //now we only need to create the subset itself
406  return CVFolds<LabeledData<I,L> >(set,partitionStart);
407 }
408 
409 
410 
411 //! \brief Create a partition for cross validation from indices for both ordering and partitioning.
412 //!
413 //! Create a partition from indices. There is one index vector assigning an order
414 //! to the samples, and another one assigning each sample to a validation partition.
415 //! That is, given a dataset set, and at the i-th processing step, this function puts
416 //! the order_indices[i]-th sample into the partition_indices[i]-th partition. The
417 //! order_indices part of the above procedure matters if both an inner and
418 //! outer partition are to be recreated: for the inner partition to be recreated, too,
419 //! the outer partition must be recreated in the same order, not just partitioned into
420 //! the same splits.
421 //!
422 //! \param set partitions will be subsets of this set
423 //! \param numberOfPartitions number of partitions to create
424 //! \param indices stores location index in the first and partition index in the second vector
425 //! \param batchSize maximum batch size
426 template<class I,class L>
428  LabeledData<I,L> &set,
429  std::size_t numberOfPartitions,
430  RecreationIndices indices,
432 ) {
433  std::size_t numInputs = set.numberOfElements();
434  SIZE_CHECK(indices.first.size() == numInputs);
435  SIZE_CHECK(indices.second.size() == numInputs);
436  SIZE_CHECK(numberOfPartitions == *std::max_element(indices.second.begin(),indices.second.end())+1);
437 
438  //calculate the size of validation partitions
439  std::vector<std::size_t> validationSize(numberOfPartitions,0);
440  for (std::size_t input = 0; input != numInputs; input++) {
441  validationSize[indices.second[input]]++;
442  }
443 
444  //calculate the size of batches for every validation part and their total number
445  std::vector<std::size_t> partitionStart;
446  std::vector<std::size_t> batchSizes;
447  std::size_t numBatches = detail::batchPartitioning(validationSize,partitionStart,batchSizes,batchSize);
448 
449  //construct a new set with the correct batch format from the old set
450  LabeledData<I,L> newSet(numBatches);
451  DataView<LabeledData<I,L> > setView(set); //fast access to single elements of the original set
452  std::vector<std::size_t> validationSetStart = partitionStart; //current index for the batch of every partition
453  std::vector<std::vector<std::size_t> > batchElements(numberOfPartitions);
454  for (std::size_t input = 0; input != numInputs; input++) {
455  std::size_t partition = indices.second[input]; //the second vector's contents indicate the partition to assign each sample to.
456  batchElements[partition].push_back( indices.first[input] ); //the first vector's contents indicate from what original position to get the next sample.
457 
458  //if all elements for the current batch are found, create it
459  std::size_t batchNumber = validationSetStart[partition];
460  if (batchElements[partition].size() == batchSizes[batchNumber]) {
461  newSet.batch(validationSetStart[partition]) = subBatch(setView,batchElements[partition]);
462  batchElements[partition].clear();
463  ++validationSetStart[partition];
464  }
465  }
466  swap(set, newSet);
467  //now we only need to create the subset itself
468  return CVFolds<LabeledData<I,L> >(set,partitionStart);
469 }
470 
471 
472 // much more to come...
473 
474 /** @}*/
475 }
476 #endif