WeightedDataset.h
Go to the documentation of this file.
1 //===========================================================================
2 /*!
3  *
4  *
5  * \brief Weighted data sets for (un-)supervised learning.
6  *
7  *
8  * \par
9  * This file provides containers for data used by the models, loss
10  * functions, and learning algorithms (trainers). The reason for
11  * dedicated containers of this type is that data often need to be
12  * split into subsets, such as training and test data, or folds in
13  * cross-validation. The containers in this file provide memory
14  * efficient mechanisms for managing and providing such subsets.
15  * The speciality of these containers is that they are weighted.
16  *
17  *
18  *
19  * \author O. Krause
20  * \date 2014
21  *
22  *
23  * \par Copyright 1995-2017 Shark Development Team
24  *
25  * <BR><HR>
26  * This file is part of Shark.
27  * <http://shark-ml.org/>
28  *
29  * Shark is free software: you can redistribute it and/or modify
30  * it under the terms of the GNU Lesser General Public License as published
31  * by the Free Software Foundation, either version 3 of the License, or
32  * (at your option) any later version.
33  *
34  * Shark is distributed in the hope that it will be useful,
35  * but WITHOUT ANY WARRANTY; without even the implied warranty of
36  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
37  * GNU Lesser General Public License for more details.
38  *
39  * You should have received a copy of the GNU Lesser General Public License
40  * along with Shark. If not, see <http://www.gnu.org/licenses/>.
41  *
42  */
43 //===========================================================================
44 
45 #ifndef SHARK_DATA_WEIGHTED_DATASET_H
46 #define SHARK_DATA_WEIGHTED_DATASET_H
47 
48 #include <shark/Data/Dataset.h>
49 namespace shark {
50 
51 ///\brief Data-weight pair: a data element together with its weight
52 template<class DataType, class WeightType>
54  DataType data;
55  WeightType weight;
56 
58 
59  template<class DataT, class WeightT>
61  DataT&& data,
62  WeightT&& weight
63  ):data(data),weight(weight){}
64 
65  template<class DataT, class WeightT>
68  ):data(pair.data),weight(pair.weight){}
69 
70  template<class DataT, class WeightT>
72  data = batch.data;
73  weight = batch.weight;
74  return *this;
75  }
77  data = batch.data;
78  weight = batch.weight;
79  return *this;
80  }
81 };
82 
83 template<class D1, class W1, class D2, class W2>
85  using std::swap;
86  swap(std::forward<D1>(p1.data),std::forward<D2>(p2.data));
87  swap(std::forward<W1>(p1.weight),std::forward<W2>(p2.weight));
88 }
89 
90 template<class DataBatchType,class WeightBatchType>
92 private:
95 public:
96  DataBatchType data;
97  WeightBatchType weight;
98 
99  typedef WeightedDataPair<
100  typename DataBatchTraits::value_type,
101  typename WeightBatchTraits::value_type
102  > value_type;
103  typedef WeightedDataPair<
104  decltype(getBatchElement(std::declval<DataBatchType&>(),0)),
105  decltype(getBatchElement(std::declval<WeightBatchType&>(),0))
107  typedef WeightedDataPair<
108  decltype(getBatchElement(std::declval<typename std::add_const<DataBatchType>::type&>(),0)),
109  decltype(getBatchElement(std::declval<typename std::add_const<WeightBatchType>::type&>(),0))
113 
114  template<class D, class W>
116  D&& data,
117  W&& weight
118  ):data(data),weight(weight){}
119 
120  template<class Pair>
122  std::size_t size,Pair const& p
123  ):data(DataBatchTraits::createBatch(p.data,size)),weight(WeightBatchTraits::createBatch(p.weight,size)){}
124 
125  template<class I, class L>
127  data = batch.data;
128  weight = batch.weight;
129  return *this;
130  }
131 
132  std::size_t size()const{
133  return DataBatchTraits::size(data);
134  }
135 
136  iterator begin(){
137  return iterator(*this,0);
138  }
139  const_iterator begin()const{
140  return const_iterator(*this,0);
141  }
142 
143  iterator end(){
144  return iterator(*this,size());
145  }
146  const_iterator end()const{
147  return const_iterator(*this,size());
148  }
149 
150  reference operator[](std::size_t i){
151  return reference(getBatchElement(data,i),getBatchElement(weight,i));
152  }
153  const_reference operator[](std::size_t i)const{
154  return const_reference(getBatchElement(data,i),getBatchElement(weight,i));
155  }
156 };
157 
158 template<class D1, class W1, class D2, class W2>
160  using std::swap;
161  swap(p1.data,p2.data);
162  swap(p1.weight,p2.weight);
163 }
164 
165 template<class DataType, class WeightType>
166 struct Batch<WeightedDataPair<DataType, WeightType> >
167 : public detail::SimpleBatch<
168  WeightedDataBatch<typename detail::element_to_batch<DataType>::type, typename detail::element_to_batch<WeightType>::type>
169 >{};
170 
171 template<class DataType, class WeightType>
172 struct BatchTraits<WeightedDataBatch<DataType, WeightType> >{
173  typedef typename detail::batch_to_element<DataType>::type DataElem;
174  typedef typename detail::batch_to_element<WeightType>::type WeightElem;
176 };
177 
178 
179 namespace detail{
180 template <class DataContainerT>
181 class BaseWeightedDataset : public ISerializable
182 {
183 public:
184  typedef typename DataContainerT::element_type DataType;
185  typedef double WeightType;
186  typedef DataContainerT DataContainer;
187  typedef Data<WeightType> WeightContainer;
188  typedef typename DataContainer::IndexSet IndexSet;
189 
190  // TYPEDEFS FOR PAIRS
191  typedef WeightedDataPair<
192  DataType,
193  WeightType
194  > element_type;
195 
196  typedef WeightedDataBatch<
197  typename DataContainer::batch_type,
198  typename WeightContainer::batch_type
199  > batch_type;
200 
201  // TYPEDEFS FOR BATCH REFERENCES
202  typedef WeightedDataBatch<
203  typename DataContainer::batch_reference,
204  typename WeightContainer::batch_reference
205  > batch_reference;
206  typedef WeightedDataBatch<
207  typename DataContainer::const_batch_reference,
208  typename WeightContainer::const_batch_reference
209  > const_batch_reference;
210 
211  typedef typename Batch<element_type>::reference element_reference;
212  typedef typename Batch<element_type>::const_reference const_element_reference;
213 
214  typedef boost::iterator_range< detail::DataElementIterator<BaseWeightedDataset<DataContainer> > > element_range;
215  typedef boost::iterator_range< detail::DataElementIterator<BaseWeightedDataset<DataContainer> const> > const_element_range;
216  typedef detail::BatchRange<BaseWeightedDataset<DataContainer> > batch_range;
217  typedef detail::BatchRange<BaseWeightedDataset<DataContainer> const> const_batch_range;
218 
219 
220  ///\brief Returns the range of elements.
221  ///
222  ///It is compatible to boost::range and STL and can be used whenever an algorithm requires
223  ///element access via begin()/end() in which case data.elements() provides the correct interface
224  const_element_range elements()const{
225  return const_element_range(
226  detail::DataElementIterator<BaseWeightedDataset<DataContainer> const>(this,0,0,0),
227  detail::DataElementIterator<BaseWeightedDataset<DataContainer> const>(this,numberOfBatches(),0,numberOfElements())
228  );
229  }
230  ///\brief Returns therange of elements.
231  ///
232  ///It is compatible to boost::range and STL and can be used whenever an algorithm requires
233  ///element access via begin()/end() in which case data.elements() provides the correct interface
234  element_range elements(){
235  return element_range(
236  detail::DataElementIterator<BaseWeightedDataset<DataContainer> >(this,0,0,0),
237  detail::DataElementIterator<BaseWeightedDataset<DataContainer> >(this,numberOfBatches(),0,numberOfElements())
238  );
239  }
240 
241  ///\brief Returns the range of batches.
242  ///
243  ///It is compatible to boost::range and STL and can be used whenever an algorithm requires
244  ///element access via begin()/end() in which case data.elements() provides the correct interface
245  const_batch_range batches()const{
246  return const_batch_range(this);
247  }
248  ///\brief Returns the range of batches.
249  ///
250  ///It is compatible to boost::range and STL and can be used whenever an algorithm requires
251  ///element access via begin()/end() in which case data.elements() provides the correct interface
252  batch_range batches(){
253  return batch_range(this);
254  }
255 
256  ///\brief Returns the number of batches of the set.
257  std::size_t numberOfBatches() const{
258  return m_data.numberOfBatches();
259  }
260  ///\brief Returns the total number of elements.
261  std::size_t numberOfElements() const{
262  return m_data.numberOfElements();
263  }
264 
265  ///\brief Check whether the set is empty.
266  bool empty() const{
267  return m_data.empty();
268  }
269 
270  ///\brief Access to the stored data points as a separate container.
271  DataContainer const& data() const{
272  return m_data;
273  }
274  ///\brief Access to the stored data points as a separate container.
275  DataContainer& data(){
276  return m_data;
277  }
278 
279  ///\brief Access to weights as a separate container.
280  WeightContainer const& weights() const{
281  return m_weights;
282  }
283  ///\brief Access to weights as a separate container.
284  WeightContainer& weights(){
285  return m_weights;
286  }
287 
288  // CONSTRUCTORS
289 
290  ///\brief Constructs an Empty data set.
291  BaseWeightedDataset()
292  {}
293 
294  ///\brief Create an empty set with just the correct number of batches.
295  ///
296  /// The user must initialize the dataset after that by himself.
297  BaseWeightedDataset(std::size_t numBatches)
298  : m_data(numBatches),m_weights(numBatches)
299  {}
300 
301  /// \brief Construtor using a single element as blueprint to create a dataset with a specified number of elements.
302  ///
303  /// Optionally the desired batch Size can be set
304  ///
305  ///@param size the new size of the container
306  ///@param element the blueprint element from which to create the Container
307  ///@param batchSize the size of the batches. if this is 0, the size is unlimited
308  BaseWeightedDataset(std::size_t size, element_type const& element, std::size_t batchSize)
309  : m_data(size,element.data,batchSize)
310  , m_weights(size,element.weight,batchSize)
311  {}
312 
313  ///\brief Construction from data and a dataset rpresnting the weights
314  ///
315  /// Beware that when calling this constructor the organization of batches must be equal in both
316  /// containers. This Constructor will not reorganize the data!
317  BaseWeightedDataset(DataContainer const& data, Data<WeightType> const& weights)
318  : m_data(data), m_weights(weights)
319  {
320  SHARK_RUNTIME_CHECK(data.numberOfElements() == weights.numberOfElements(), "[ BaseWeightedDataset::WeightedUnlabeledData] number of data and number of weights must agree");
321 #ifndef DNDEBUG
322  for(std::size_t i = 0; i != data.numberOfBatches(); ++i){
323  SIZE_CHECK(batchSize(data.batch(i)) == batchSize(weights.batch(i)));
324  }
325 #endif
326  }
327 
328  ///\brief Construction from data. All points get the same weight assigned
329  BaseWeightedDataset(DataContainer const& data, double weight)
330  : m_data(data), m_weights(data.numberOfBatches())
331  {
332  for(std::size_t i = 0; i != numberOfBatches(); ++i){
333  m_weights.batch(i) = Batch<WeightType>::type(batchSize(m_data.batch(i)),weight);
334  }
335  }
336 
337 
338  // ELEMENT ACCESS
339  element_reference element(std::size_t i){
340  return *(detail::DataElementIterator<BaseWeightedDataset<DataContainer> >(this,0,0,0)+i);
341  }
342  const_element_reference element(std::size_t i) const{
343  return *(detail::DataElementIterator<BaseWeightedDataset<DataContainer> const>(this,0,0,0)+i);
344  }
345 
346  // BATCH ACCESS
347  batch_reference batch(std::size_t i){
348  return batch_reference(m_data.batch(i),m_weights.batch(i));
349  }
350  const_batch_reference batch(std::size_t i) const{
351  return const_batch_reference(m_data.batch(i),m_weights.batch(i));
352  }
353 
354  // MISC
355 
356  /// from ISerializable
357  void read(InArchive& archive){
358  archive & m_data;
359  archive & m_weights;
360  }
361 
362  /// from ISerializable
363  void write(OutArchive& archive) const{
364  archive & m_data;
365  archive & m_weights;
366  }
367 
368  ///\brief This method makes the vector independent of all siblings and parents.
369  virtual void makeIndependent(){
370  m_weights.makeIndependent();
371  m_data.makeIndependent();
372  }
373 
374  ///\brief shuffles all elements in the entire dataset (that is, also across the batches)
375  virtual void shuffle(){
376  shark::shuffle(this->elements().begin(),this->elements().end(), random::globalRng);
377  }
378 
379  void splitBatch(std::size_t batch, std::size_t elementIndex){
380  m_data.splitBatch(batch,elementIndex);
381  m_weights.splitBatch(batch,elementIndex);
382  }
383 
384  /// \brief Appends the contents of another data object to the end
385  ///
386  /// The batches are not copied but now referenced from both datasets. Thus changing the appended
387  /// dataset might change this one as well.
388  void append(BaseWeightedDataset const& other){
389  m_data.append(other.m_data);
390  m_weights.append(other.m_weights);
391  }
392 
393 
394  ///\brief Reorders the batch structure in the container to that indicated by the batchSizes vector
395  ///
396  ///After the operation the container will contain batchSizes.size() batches with the i-th batch having size batchSize[i].
397  ///However the sum of all batch sizes must be equal to the current number of elements
398  template<class Range>
399  void repartition(Range const& batchSizes){
400  m_data.repartition(batchSizes);
401  m_weights.repartition(batchSizes);
402  }
403 
404  /// \brief Creates a vector with the batch sizes of every batch.
405  ///
406  /// This method can be used together with repartition to ensure
407  /// that two datasets have the same batch structure.
408  std::vector<std::size_t> getPartitioning()const{
409  return m_data.getPartitioning();
410  }
411 
412  friend void swap( BaseWeightedDataset& a, BaseWeightedDataset& b){
413  swap(a.m_data,b.m_data);
414  swap(a.m_weights,b.m_weights);
415  }
416 
417 
418  // SUBSETS
419 
420  ///\brief Fill in the subset defined by the list of indices.
421  BaseWeightedDataset indexedSubset(IndexSet const& indices) const{
422  BaseWeightedDataset subset;
423  subset.m_data = m_data.indexedSubset(indices);
424  subset.m_weights = m_weights.indexedSubset(indices);
425  return subset;
426  }
427 private:
428  DataContainer m_data; /// point data
429  WeightContainer m_weights; /// weight data
430 };
431 }
432 
433 ///
434 /// \brief Weighted data set for unsupervised learning
435 ///
436 /// The WeightedUnlabeledData class extends UnlabeledData for the
437 /// representation of data. In addition it holds and provides access to the corresponding weights.
438 ///
439 /// WeightedUnlabeledData tries to mimic the underlying data as pairs of data points and weights.
440 /// this means that when accessing a batch by calling batch(i) or choosing one of the iterators
441 /// one access the input batch by batch(i).data and the weights by batch(i).weight
442 ///
443 ///this also holds true for single element access using operator(). Be aware, that direct access to element is
444 ///a linear time operation. So it is not advisable to iterate over the elements, but instead iterate over the batches.
445 template <class DataT>
446 class WeightedUnlabeledData : public detail::BaseWeightedDataset <UnlabeledData<DataT> >
447 {
448 private:
449  typedef detail::BaseWeightedDataset <UnlabeledData<DataT> > base_type;
450 public:
451  using base_type::data;
452  using base_type::weights;
453  typedef typename base_type::DataType DataType;
454  typedef typename base_type::WeightType WeightType;
455  typedef typename base_type::element_type element_type;
456  typedef DataT InputType;
457 
458  BOOST_STATIC_CONSTANT(std::size_t, DefaultBatchSize = UnlabeledData<DataT>::DefaultBatchSize);
459 
460  // CONSTRUCTORS
461 
462  ///\brief Empty data set.
464  {}
465 
466  ///\brief Create an empty set with just the correct number of batches.
467  ///
468  /// The user must initialize the dataset after that by himself.
469  WeightedUnlabeledData(std::size_t numBatches)
470  : base_type(numBatches)
471  {}
472 
473  /// \brief Constructor using a single element as blueprint to create a dataset with a specified number of elements.
474  ///
475  /// Optionally the desired batch Size can be set
476  ///
477  ///@param size the new size of the container
478  ///@param element the blueprint element from which to create the Container
479  ///@param batchSize the size of the batches. if this is 0, the size is unlimited
480  WeightedUnlabeledData(std::size_t size, element_type const& element, std::size_t batchSize = DefaultBatchSize)
481  : base_type(size,element,batchSize){}
482 
483  ///\brief Construction from data.
484  ///
485  /// Beware that when calling this constructor the organization of batches must be equal in both
486  /// containers. This Constructor will not reorganize the data!
488  : base_type(data,weights)
489  {}
490 
491  ///\brief Construction from data and a constant weight for all elements
492  WeightedUnlabeledData(UnlabeledData<DataType> const& data, double weight)
493  : base_type(data,weight)
494  {}
495 
496  //we additionally add the two below for compatibility with UnlabeledData
497 
498  ///\brief Access to the inputs as a separate container.
499  UnlabeledData<DataT> const& inputs() const{
500  return data();
501  }
502  ///\brief Access to the inputs as a separate container.
504  return data();
505  }
506 
507  ///\brief Returns the Shape of the data.
508  Shape const& shape() const{
509  return data().shape();
510  }
511 
512  ///\brief Returns the Shape of the data.
514  return data().shape();
515  }
516  ///\brief Splits the container into two independent parts. The left part remains in the container, the right is stored as return type
517  ///
518  ///Order of elements remain unchanged. The SharedVector is not allowed to be shared for
519  ///this to work.
520  WeightedUnlabeledData splice(std::size_t batch){
521  return WeightedUnlabeledData(data().splice(batch),weights().splice(batch));
522  }
523 
525  swap(static_cast<base_type&>(a),static_cast<base_type&>(b));
526  }
527 };
528 
529 ///brief Outstream of elements for weighted data.
530 template<class T>
531 std::ostream &operator << (std::ostream &stream, const WeightedUnlabeledData<T>& d) {
532  for(auto elem: d.elements())
533  stream << elem.weight << " [" << elem.data<<"]"<< "\n";
534  return stream;
535 }
536 
537 /// \brief creates a weighted unlabeled data object from two ranges, representing data and weights
538 template<class DataRange, class WeightRange>
539 typename boost::disable_if<
540  boost::is_arithmetic<WeightRange>,
541  WeightedUnlabeledData<
542  typename boost::range_value<DataRange>::type
543  >
544 >::type createUnlabeledDataFromRange(DataRange const& data, WeightRange const& weights, std::size_t batchSize = 0){
545 
546  SHARK_RUNTIME_CHECK(batchSize(data) == batchSize(weights),"Number of datapoints and number of weights must agree");
547 
548  typedef typename boost::range_value<DataRange>::type Data;
549 
550  if (batchSize == 0)
552 
556  );
557 }
558 
559 
560 ///
561 /// \brief Weighted data set for supervised learning
562 ///
563 /// The WeightedLabeledData class extends LabeledData for the
564 /// representation of data. In addition it holds and provides access to the corresponding weights.
565 ///
566 /// WeightedLabeledData tries to mimic the underlying data as pairs of data tuples(input,label) and weights.
567 /// this means that when accessing a batch by calling batch(i) or choosing one of the iterators
568 /// one access the databatch by batch(i).data and the weights by batch(i).weight. to access the points and labels
569 /// use batch(i).data.input and batch(i).data.label
570 ///
571 ///this also holds true for single element access using operator(). Be aware, that direct access to element is
572 ///a linear time operation. So it is not advisable to iterate over the elements, but instead iterate over the batches.
573 ///
574 /// It is possible to gain several views on the set. One can either get access to inputs, labels and weights separately
575 /// or gain access to the unweighted dataset of inputs and labels. Additionally the sets support on-the-fly creation
576 /// of the (inputs,weights) subset for unsupervised weighted learning
577 template <class InputT, class LabelT>
578 class WeightedLabeledData : public detail::BaseWeightedDataset <LabeledData<InputT,LabelT> >
579 {
580 private:
581  typedef detail::BaseWeightedDataset <LabeledData<InputT,LabelT> > base_type;
582 public:
583  typedef typename base_type::DataType DataType;
584  typedef typename base_type::WeightType WeightType;
585  typedef InputT InputType;
586  typedef LabelT LabelType;
587  typedef typename base_type::element_type element_type;
588 
589  using base_type::data;
590  using base_type::weights;
591 
592  BOOST_STATIC_CONSTANT(std::size_t, DefaultBatchSize = (LabeledData<InputT,LabelT>::DefaultBatchSize));
593 
594  // CONSTRUCTORS
595 
596  ///\brief Empty data set.
598  {}
599 
600  ///\brief Create an empty set with just the correct number of batches.
601  ///
602  /// The user must initialize the dataset after that by himself.
603  WeightedLabeledData(std::size_t numBatches)
604  : base_type(numBatches)
605  {}
606 
607  /// \brief Constructor using a single element as blueprint to create a dataset with a specified number of elements.
608  ///
609  /// Optionally the desired batch Size can be set
610  ///
611  ///@param size the new size of the container
612  ///@param element the blueprint element from which to create the Container
613  ///@param batchSize the size of the batches. if this is 0, the size is unlimited
614  WeightedLabeledData(std::size_t size, element_type const& element, std::size_t batchSize = DefaultBatchSize)
615  : base_type(size,element,batchSize){}
616 
617  ///\brief Construction from data.
618  ///
619  /// Beware that when calling this constructor the organization of batches must be equal in both
620  /// containers. This Constructor will not reorganize the data!
622  : base_type(data,weights)
623  {}
624 
625  ///\brief Construction from data and a constant weight for all elements
627  : base_type(data,weight)
628  {}
629 
630  ///\brief Access to the inputs as a separate container.
632  return data().inputs();
633  }
634  ///\brief Access to the inputs as a separate container.
636  return data().inputs();
637  }
638 
639  ///\brief Access to the labels as a separate container.
640  Data<LabelType> const& labels() const{
641  return data().labels();
642  }
643  ///\brief Access to the labels as a separate container.
645  return data().labels();
646  }
647 
648  ///\brief Returns the Shape of the inputs.
649  Shape const& inputShape() const{
650  return inputs().shape();
651  }
652 
653  ///\brief Returns the Shape of the inputs.
655  return inputs().shape();
656  }
657 
658  ///\brief Returns the Shape of the labels.
659  Shape const& labelShape() const{
660  return labels().shape();
661  }
662 
663  ///\brief Returns the Shape of the labels.
665  return labels().shape();
666  }
667 
668  /// \brief Constructs an WeightedUnlabeledData object for the inputs.
670  return WeightedUnlabeledData<InputType>(data().inputs(),weights());
671  }
672 
673  ///\brief Splits the container into two independent parts. The left part remains in the container, the right is stored as return type
674  ///
675  ///Order of elements remain unchanged. The SharedVector is not allowed to be shared for
676  ///this to work.
677  WeightedLabeledData splice(std::size_t batch){
678  return WeightedLabeledData(data().splice(batch),weights().splice(batch));
679  }
680 
682  swap(static_cast<base_type&>(a),static_cast<base_type&>(b));
683  }
684 };
685 
686 ///brief Outstream of elements for weighted labeled data.
687 template<class T, class U>
688 std::ostream &operator << (std::ostream &stream, const WeightedLabeledData<T, U>& d) {
689  for(auto elem: d.elements())
690  stream << elem.weight <<" ("<< elem.data.label << " [" << elem.data.input<<"] )"<< "\n";
691  return stream;
692 }
693 
694 //Stuff for Dimensionality and querying of basic information
695 
696 inline std::size_t numberOfClasses(WeightedUnlabeledData<unsigned int> const& labels){
697  return numberOfClasses(labels.data());
698 }
699 
700 ///\brief Returns the number of members of each class in the dataset.
701 inline std::vector<std::size_t> classSizes(WeightedUnlabeledData<unsigned int> const& labels){
702  return classSizes(labels.data());
703 }
704 
705 ///\brief Return the dimensionality of points of a weighted dataset
706 template <class InputType>
708  return dataDimension(dataset.data());
709 }
710 
711 ///\brief Return the input dimensionality of a weighted labeled dataset.
712 template <class InputType, class LabelType>
714  return dataDimension(dataset.inputs());
715 }
716 
717 ///\brief Return the label/output dimensionality of a labeled dataset.
718 template <class InputType, class LabelType>
720  return dataDimension(dataset.labels());
721 }
722 ///\brief Return the number of classes (highest label value +1) of a classification dataset with unsigned int label encoding
723 template <class InputType>
725  return numberOfClasses(dataset.labels());
726 }
727 
728 ///\brief Returns the number of members of each class in the dataset.
729 template<class InputType, class LabelType>
730 inline std::vector<std::size_t> classSizes(WeightedLabeledData<InputType, LabelType> const& dataset){
731  return classSizes(dataset.labels());
732 }
733 
734 ///\brief Returns the total sum of weights.
735 template<class InputType>
737  double weightSum = 0;
738  for(std::size_t i = 0; i != dataset.numberOfBatches(); ++i){
739  weightSum += sum(dataset.batch(i).weight);
740  }
741  return weightSum;
742 }
743 ///\brief Returns the total sum of weights.
744 template<class InputType, class LabelType>
746  double weightSum = 0;
747  for(std::size_t i = 0; i != dataset.numberOfBatches(); ++i){
748  weightSum += sum(dataset.batch(i).weight);
749  }
750  return weightSum;
751 }
752 
753 /// \brief Computes the cumulative weight of every class.
754 template<class InputType>
756  RealVector weights(numberOfClasses(dataset),0.0);
757  for(auto const& elem: dataset.elements()){
758  weights(elem.data.label) += elem.weight;
759  }
760  return weights;
761 }
762 
763 //creation of weighted datasets
764 
765 /// \brief creates a weighted labeled data object from three ranges, representing inputs, labels and weights
766 template<class InputRange,class LabelRange, class WeightRange>
767 typename boost::disable_if<
768  boost::is_arithmetic<WeightRange>,
770  typename boost::range_value<InputRange>::type,
771  typename boost::range_value<LabelRange>::type
772  >
773 >::type createLabeledDataFromRange(InputRange const& inputs, LabelRange const& labels, WeightRange const& weights, std::size_t batchSize = 0){
774 
775  SHARK_RUNTIME_CHECK(batchSize(inputs) == batchSize(labels),
776  "number of inputs and number of labels must agree");
777  SHARK_RUNTIME_CHECK(batchSize(inputs) == batchSize(weights),
778  "number of data points and number of weights must agree");
779  typedef typename boost::range_value<InputRange>::type InputType;
780  typedef typename boost::range_value<LabelRange>::type LabelType;
781 
782  if (batchSize == 0)
784 
786  createLabeledDataFromRange(inputs,labels,batchSize),
788  );
789 }
790 
791 /// \brief Creates a bootstrap partition of a labeled dataset and returns it using weighting.
792 ///
793 /// Bootstrapping resamples the dataset by drawing a set of points with
794 /// replacement. Thus the sampled set will contain some points multiple times
795 /// and some points not at all. Bootstrapping is usefull to obtain unbiased
796 /// measurements of the mean and variance of an estimator.
797 ///
798 /// Optionally the size of the bootstrap (that is, the number of sampled points)
799 /// can be set. By default it is 0, which indicates that it is the same size as the original dataset.
800 template<class InputType, class LabelType>
802  LabeledData<InputType,LabelType> const& dataset,
803  std::size_t bootStrapSize = 0
804 ){
805  if(bootStrapSize == 0)
806  bootStrapSize = dataset.numberOfElements();
807 
808  WeightedLabeledData<InputType,LabelType> bootstrapSet(dataset,0.0);
809 
810  for(std::size_t i = 0; i != bootStrapSize; ++i){
811  std::size_t index = random::discrete(random::globalRng, std::size_t(0),bootStrapSize-1);
812  bootstrapSet.element(index).weight += 1.0;
813  }
814  bootstrapSet.inputShape() = dataset.inputShape();
815  bootstrapSet.labelShape() = dataset.labelShape();
816  return bootstrapSet;
817 }
818 
819 /// \brief Creates a bootstrap partition of an unlabeled dataset and returns it using weighting.
820 ///
821 /// Bootstrapping resamples the dataset by drawing a set of points with
822 /// replacement. Thus the sampled set will contain some points multiple times
823 /// and some points not at all. Bootstrapping is usefull to obtain unbiased
824 /// measurements of the mean and variance of an estimator.
825 ///
826 /// Optionally the size of the bootstrap (that is, the number of sampled points)
827 /// can be set. By default it is 0, which indicates that it is the same size as the original dataset.
828 template<class InputType>
830  UnlabeledData<InputType> const& dataset,
831  std::size_t bootStrapSize = 0
832 ){
833  if(bootStrapSize == 0)
834  bootStrapSize = dataset.numberOfElements();
835 
836  WeightedUnlabeledData<InputType> bootstrapSet(dataset,0.0);
837 
838  for(std::size_t i = 0; i != bootStrapSize; ++i){
839  std::size_t index = random::discrete(random::globalRng, std::size_t(0),bootStrapSize-1);
840  bootstrapSet.element(index).weight += 1.0;
841  }
842  bootstrapSet.shape() = dataset.shape();
843  return bootstrapSet;
844 }
845 
846 }
847 
848 #endif