Dataset.h
Go to the documentation of this file.
1 //===========================================================================
2 /*!
3  *
4  *
5  * \brief Data for (un-)supervised learning.
6  *
7  *
8  * \par
9  * This file provides containers for data used by the models, loss
10  * functions, and learning algorithms (trainers). The reason for
11  * dedicated containers of this type is that data often need to be
12  * split into subsets, such as training and test data, or folds in
13  * cross-validation. The containers in this file provide memory
14  * efficient mechanisms for managing and providing such subsets.
15  *
16  *
17  *
18  *
19  * \author O. Krause, T. Glasmachers
20  * \date 2010-2014
21  *
22  *
23  * \par Copyright 1995-2017 Shark Development Team
24  *
25  * <BR><HR>
26  * This file is part of Shark.
27  * <http://shark-ml.org/>
28  *
29  * Shark is free software: you can redistribute it and/or modify
30  * it under the terms of the GNU Lesser General Public License as published
31  * by the Free Software Foundation, either version 3 of the License, or
32  * (at your option) any later version.
33  *
34  * Shark is distributed in the hope that it will be useful,
35  * but WITHOUT ANY WARRANTY; without even the implied warranty of
36  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
37  * GNU Lesser General Public License for more details.
38  *
39  * You should have received a copy of the GNU Lesser General Public License
40  * along with Shark. If not, see <http://www.gnu.org/licenses/>.
41  *
42  */
43 //===========================================================================
44 
45 #ifndef SHARK_DATA_DATASET_H
46 #define SHARK_DATA_DATASET_H
47 
48 #include <boost/range/iterator_range.hpp>
49 
50 #include <shark/Core/Exception.h>
51 #include <shark/Core/OpenMP.h>
53 #include <boost/iterator/transform_iterator.hpp>
54 #include <shark/Core/Random.h>
55 #include <shark/Core/Shape.h>
56 #include "Impl/Dataset.inl"
57 
58 namespace shark {
59 
60 
61 ///
62 /// \brief Data container.
63 ///
64 /// The Data class is Shark's container for machine learning data.
65 /// This container (and its sub-classes) is used for input data,
66 /// labels, and model outputs.
67 ///
68 /// \par
69 /// The Data container organizes the data it holds in batches.
70 /// This means, that it tries to find a good data representation for a whole
71 /// set of, for example 100 data points, at the same time. If the type of data it stores
72 /// is for example RealVector, the batches of this type are RealMatrices. This is good because most often
73 /// operations on the whole matrix are faster than operations on the separate vectors.
74 /// Nearly all operations of the set have to be interpreted in terms of the batch. Therefore the iterator interface will
75 /// give access to the batches but not to single elements. For this separate element_iterators and const_element_iterators
76 /// can be used.
77 ///\par
78 ///When you need to explicitly iterate over all elements, you can use:
79 ///\code
80 /// Data<RealVector> data;
81 /// for(auto elem: data.elements()){
82 /// std::cout<<elem<<" ";
83 /// auto ref=elem;
84 /// ref*=2;
85 /// std::cout<<elem<<std::endl;
86 ///}
87 ///\endcode
88 /// \par
89 /// Element wise accessing of elements is usually slower than accessing the batches. If possible, use direct batch access, or
90 /// at least use the iterator interface or the for loop above to iterate over all elements. Random access to single elements is linear time, so use it wisely.
91 /// Of course, when you want to use batches, you need to know the actual batch type. This depends on the actual type of the input.
92 /// here are the rules:
93 /// if the input is an arithmetic type like int or double, the result will be a vector of this
94 /// (i.e. double->RealVector or Int->IntVector).
95 /// For vectors the results are matrices as mentioned above. If the vector is sparse, so is the matrix.
96 /// And for everything else the batch type is just a std::vector of the type, so no optimization can be applied.
97 /// \par
98 /// When constructing the container the batchSize can be set. If it is not set by the user the default batchSize is chosen. A BatchSize of 0
99 /// corresponds to putting all data into a single batch. Beware that not only the data needs storage but also
100 /// the various models during computation. So the actual amount of space to compute a batch can greatly exceed the batch size.
101 ///
102 /// An additional feature of the Data class is that it can be used to create lazy subsets. So the batches of a dataset
103 /// can be shared between various instances of the data class without additional memory overhead.
104 ///
105 ///
106 ///\warning Be aware --especially for derived containers like LabeledData-- that the set does not enforce structural consistency.
107 /// When you change the structure of the data part for example by directly changing the size of the batches, the size of the labels is not
108 /// enforced to change accordingly. Also when creating subsets of a set, changing the parent will change its siblings and vice versa. The programmer
109 /// needs to ensure structural integrity!
110 /// For example this is dangerous:
111 /// \code
112 /// void function(Data<unsigned int>& data){
113 /// Data<unsigned int> newData(...);
114 /// data=newData;
115 /// }
116 /// \endcode
117 /// When data was originally a labeledData object, and newData has a different batch structure than data, this will lead to structural inconsistencies.
118 /// When function is rewritten such that newData has the same structure as data, this code is perfectly fine. The best way to get around this problem is
119 /// by rewriting the code as:
120 /// \code
121 /// Data<unsigned int> function(){
122 /// Data<unsigned int> newData(...);
123 /// return newData;
124 /// }
125 /// \endcode
126 ///\todo expand docu
127 template <class Type>
128 class Data : public ISerializable
129 {
130 protected:
131  typedef detail::SharedContainer<Type> Container;
132 
133  Container m_data;///< data
134  Shape m_shape;///< shape of a datapoint
135 public:
136  /// \brief Defines the default batch size of the Container.
137  ///
138  /// Zero means: unlimited
139  BOOST_STATIC_CONSTANT(std::size_t, DefaultBatchSize = 256);
140 
141  typedef typename Container::BatchType batch_type;
142  typedef batch_type& batch_reference;
143  typedef batch_type const& const_batch_reference;
144 
145  typedef Type element_type;
148 
149  typedef std::vector<std::size_t> IndexSet;
150 
151  /// \brief Two containers compare equal if they share the same data.
152  template <class T> bool operator == (const Data<T>& rhs) {
153  return (m_data == rhs.m_data);
154  }
155 
156  /// \brief Two containers compare unequal if they don't share the same data.
157  template <class T> bool operator != (const Data<T>& rhs) {
158  return (! (*this == rhs));
159  }
160 
161  template <class InputT, class LabelT> friend class LabeledData;
162 
163  // RANGES
164  typedef boost::iterator_range< detail::DataElementIterator<Data<Type> > > element_range;
165  typedef boost::iterator_range< detail::DataElementIterator<Data<Type> const> > const_element_range;
166  typedef detail::BatchRange<Data<Type> > batch_range;
167  typedef detail::BatchRange<Data<Type> const> const_batch_range;
168 
169 
170  ///\brief Returns the range of elements.
171  ///
172  ///It is compatible to boost::range and STL and can be used whenever an algorithm requires
173  ///element access via begin()/end() in which case data.elements() provides the correct interface
174  const_element_range elements()const{
175  return const_element_range(
176  detail::DataElementIterator<Data<Type> const>(this,0,0,0),
177  detail::DataElementIterator<Data<Type> const>(this,numberOfBatches(),0,numberOfElements())
178  );
179  }
180  ///\brief Returns therange of elements.
181  ///
182  ///It is compatible to boost::range and STL and can be used whenever an algorithm requires
183  ///element access via begin()/end() in which case data.elements() provides the correct interface
184  element_range elements(){
185  return element_range(
186  detail::DataElementIterator<Data<Type> >(this,0,0,0),
187  detail::DataElementIterator<Data<Type> >(this,numberOfBatches(),0,numberOfElements())
188  );
189  }
190 
191  ///\brief Returns the range of batches.
192  ///
193  ///It is compatible to boost::range and STL and can be used whenever an algorithm requires
194  ///element access via begin()/end() in which case data.elements() provides the correct interface
195  const_batch_range batches()const{
196  return const_batch_range(this);
197  }
198  ///\brief Returns the range of batches.
199  ///
200  ///It is compatible to boost::range and STL and can be used whenever an algorithm requires
201  ///element access via begin()/end() in which case data.elements() provides the correct interface
202  batch_range batches(){
203  return batch_range(this);
204  }
205 
206  ///\brief Returns the number of batches of the set.
207  std::size_t numberOfBatches() const{
208  return m_data.size();
209  }
210  ///\brief Returns the total number of elements.
211  std::size_t numberOfElements() const{
212  return m_data.numberOfElements();
213  }
214 
215 
216  ///\brief Returns the shape of the elements in the dataset.
217  Shape const& shape() const{
218  return m_shape;
219  }
220 
221  ///\brief Returns the shape of the elements in the dataset.
223  return m_shape;
224  }
225 
226  ///\brief Check whether the set is empty.
227  bool empty() const{
228  return m_data.empty();
229  }
230 
231  // ELEMENT ACCESS
232  element_reference element(std::size_t i){
233  return *(detail::DataElementIterator<Data<Type> >(this,0,0,0)+i);
234  }
235  const_element_reference element(std::size_t i) const{
236  return *(detail::DataElementIterator<Data<Type> const>(this,0,0,0)+i);
237  }
238 
239  // BATCH ACCESS
240  batch_reference batch(std::size_t i){
241  return *(m_data.begin()+i);
242  }
243  const_batch_reference batch(std::size_t i) const{
244  return *(m_data.begin()+i);
245  }
246 
247  // CONSTRUCTORS
248 
249  ///\brief Constructor which constructs an empty set
250  Data(){ }
251 
252  ///\brief Construct a dataset with empty batches.
253  explicit Data(std::size_t numBatches) : m_data( numBatches )
254  { }
255 
256  ///\brief Construction with size and a single element
257  ///
258  /// Optionally the desired batch Size can be set
259  ///
260  ///@param size the new size of the container
261  ///@param element the blueprint element from which to create the Container
262  ///@param batchSize the size of the batches. if this is 0, the size is unlimited
263  explicit Data(std::size_t size, element_type const& element, std::size_t batchSize = DefaultBatchSize)
264  : m_data(size,element,batchSize)
265  { }
266 
267  // MISC
268 
269  void read(InArchive& archive){
270  archive >> m_data;
271  archive >> m_shape;
272  }
273 
274  void write(OutArchive& archive) const{
275  archive << m_data;
276  archive << m_shape;
277  }
278  ///\brief This method makes the vector independent of all siblings and parents.
279  virtual void makeIndependent(){
280  m_data.makeIndependent();
281  }
282 
283 
284  // METHODS TO ALTER BATCH STRUCTURE
285 
286  void splitBatch(std::size_t batch, std::size_t elementIndex){
287  m_data.splitBatch(m_data.begin()+batch,elementIndex);
288  }
289 
290  ///\brief Splits the container into two independent parts. The front part remains in the container, the back part is returned.
291  ///
292  ///Order of elements remain unchanged. The SharedVector is not allowed to be shared for
293  ///this to work.
294  Data splice(std::size_t batch){
295  Data right;
296  right.m_data = m_data.splice(m_data.begin()+batch);
297  right.m_shape = m_shape;
298  return right;
299  }
300 
301  /// \brief Appends the contents of another data object to the end
302  ///
303  /// The batches are not copied but now referenced from both datasets. Thus changing the appended
304  /// dataset might change this one as well.
305  void append(Data const& other){
306  m_data.append(other.m_data);
307  }
308 
309  void push_back(const_batch_reference batch){
310  m_data.push_back(batch);
311  }
312 
313  ///\brief Reorders the batch structure in the container to that indicated by the batchSizes vector
314  ///
315  ///After the operation the container will contain batchSizes.size() batchs with the i-th batch having size batchSize[i].
316  ///However the sum of all batch sizes must be equal to the current number of elements
317  template<class Range>
318  void repartition(Range const& batchSizes){
319  m_data.repartition(batchSizes);
320  }
321 
322  /// \brief Creates a vector with the batch sizes of every batch.
323  ///
324  /// This method can be used together with repartition to ensure
325  /// that two datasets have the same batch structure.
326  std::vector<std::size_t> getPartitioning()const{
327  return m_data.getPartitioning();
328  }
329 
330  // SUBSETS
331 
332  ///\brief Fill in the subset defined by the list of indices as well as its complement.
333  void indexedSubset(IndexSet const& indices, Data& subset, Data& complement) const{
334  IndexSet comp;
335  detail::complement(indices,m_data.size(),comp);
336  subset.m_data=Container(m_data,indices);
337  complement.m_data=Container(m_data,comp);
338  }
339 
340  Data indexedSubset(IndexSet const& indices) const{
341  Data subset;
342  subset.m_data = Container(m_data,indices);
343  subset.m_shape = m_shape;
344  return subset;
345  }
346 
347  friend void swap(Data& a, Data& b){
348  swap(a.m_data,b.m_data);
350  }
351 };
352 
353 /**
354  * \ingroup shark_globals
355  * @{
356  */
357 
358 /// Outstream of elements.
359 template<class T>
360 std::ostream &operator << (std::ostream &stream, const Data<T>& d) {
361  for(auto elem:d.elements())
362  stream << elem << "\n";
363  return stream;
364 }
365 /** @} */
366 
367 /// \brief Data set for unsupervised learning.
368 ///
369 /// The UnlabeledData class is basically a standard Data container
370 /// with the special interpretation of its data point being
371 /// "inputs" to a learning algorithm.
372 template <class InputT>
373 class UnlabeledData : public Data<InputT>
374 {
375 public:
376  typedef InputT element_type;
378  typedef element_type InputType;
379  typedef detail::SharedContainer<InputT> InputContainer;
380 
381 protected:
382  using base_type::m_data;
383 public:
384 
385  ///\brief Constructor.
387  { }
388 
389  ///\brief Construction from data.
391  : base_type(points)
392  { }
393 
394  ///\brief Construction with size and a single element
395  ///
396  /// Optionally the desired batch Size can be set
397  ///
398  ///@param size the new size of the container
399  ///@param element the blueprint element from which to create the Container
400  ///@param batchSize the size of the batches. if this is 0, the size is unlimited
401  UnlabeledData(std::size_t size, element_type const& element, std::size_t batchSize = base_type::DefaultBatchSize)
402  : base_type(size,element,batchSize)
403  { }
404 
405  ///\brief Create an empty set with just the correct number of batches.
406  ///
407  /// The user must initialize the dataset after that by himself.
408  UnlabeledData(std::size_t numBatches)
409  : base_type(numBatches)
410  { }
411 
412  ///\brief Construct a dataset with different batch sizes. it is a copy of the other dataset
413  UnlabeledData(UnlabeledData const& container, std::vector<std::size_t> batchSizes)
414  :base_type(container,batchSizes){}
415 
416  /// \brief we allow assignment from Data.
418  static_cast<Data<InputT>& >(*this) = data;
419  return *this;
420  }
421 
422  ///\brief Access to the base_type class as "inputs".
423  ///
424  /// Added for consistency with the LabeledData::labels() method.
426  return *this;
427  }
428 
429  ///\brief Access to the base_type class as "inputs".
430  ///
431  /// Added for consistency with the LabeledData::labels() method.
432  UnlabeledData const& inputs() const{
433  return *this;
434  }
435 
436  ///\brief Splits the container in two independent parts. The left part remains in the container, the right is stored as return type
437  ///
438  ///Order of elements remain unchanged. The SharedVector is not allowed to be shared for
439  ///this to work.
440  UnlabeledData splice(std::size_t batch){
441  UnlabeledData right;
442  right.m_data = m_data.splice(m_data.begin()+batch);
443  right.m_shape = this->m_shape;
444  return right;
445  }
446 
447  ///\brief shuffles all elements in the entire dataset (that is, also across the batches)
448  virtual void shuffle(){
449  shark::shuffle(this->elements().begin(),this->elements().end(), random::globalRng);
450  }
451 };
452 
453 ///
454 /// \brief Data set for supervised learning.
455 ///
456 /// The LabeledData class extends UnlabeledData for the
457 /// representation of inputs. In addition it holds and
458 /// provides access to the corresponding labels.
459 ///
460 /// LabeledData tries to mimic the underlying data as pairs of input and label data.
461 /// this means that when accessing a batch by calling batch(i) or choosing one of the iterators
462 /// one access the input batch by batch(i).input and the labels by batch(i).label
463 ///
464 ///this also holds true for single element access using operator(). Be aware, that direct access to an element is
465 ///a linear time operation. So it is not advisable to iterate over the elements, but instead iterate over the batches.
466 template <class InputT, class LabelT>
468 {
469 public:
470  typedef InputT InputType;
471  typedef LabelT LabelType;
475 
476  static const std::size_t DefaultBatchSize = InputContainer::DefaultBatchSize;
477 
478  // TYPEDEFS FOR PAIRS
479  typedef InputLabelBatch<
480  typename Batch<InputType>::type,
481  typename Batch<LabelType>::type
482  > batch_type;
483 
484  typedef InputLabelPair<InputType,LabelType> element_type;
485 
486  // TYPEDEFS FOR REFERENCES
487  typedef InputLabelBatch<
488  typename Batch<InputType>::type&,
489  typename Batch<LabelType>::type&
491  typedef InputLabelBatch<
492  typename Batch<InputType>::type const&,
493  typename Batch<LabelType>::type const&
495 
496  typedef typename batch_reference::reference element_reference;
497  typedef typename const_batch_reference::const_reference const_element_reference;
498 
499  typedef boost::iterator_range< detail::DataElementIterator<LabeledData<InputType,LabelType> > > element_range;
500  typedef boost::iterator_range< detail::DataElementIterator<LabeledData<InputType,LabelType> const> > const_element_range;
501  typedef detail::BatchRange<LabeledData<InputType,LabelType> > batch_range;
502  typedef detail::BatchRange<LabeledData<InputType,LabelType> const> const_batch_range;
503 
504 
505  ///\brief Returns the range of elements.
506  ///
507  ///It is compatible to boost::range and STL and can be used whenever an algorithm requires
508  ///element access via begin()/end() in which case data.elements() provides the correct interface
509  const_element_range elements()const{
510  return const_element_range(
511  detail::DataElementIterator<LabeledData<InputType,LabelType> const>(this,0,0,0),
512  detail::DataElementIterator<LabeledData<InputType,LabelType> const>(this,numberOfBatches(),0,numberOfElements())
513  );
514  }
515  ///\brief Returns therange of elements.
516  ///
517  ///It is compatible to boost::range and STL and can be used whenever an algorithm requires
518  ///element access via begin()/end() in which case data.elements() provides the correct interface
519  element_range elements(){
520  return element_range(
521  detail::DataElementIterator<LabeledData<InputType,LabelType> >(this,0,0,0),
522  detail::DataElementIterator<LabeledData<InputType,LabelType> >(this,numberOfBatches(),0,numberOfElements())
523  );
524  }
525 
526  ///\brief Returns the range of batches.
527  ///
528  ///It is compatible to boost::range and STL and can be used whenever an algorithm requires
529  ///element access via begin()/end() in which case data.elements() provides the correct interface
530  const_batch_range batches()const{
531  return const_batch_range(this);
532  }
533  ///\brief Returns the range of batches.
534  ///
535  ///It is compatible to boost::range and STL and can be used whenever an algorithm requires
536  ///element access via begin()/end() in which case data.elements() provides the correct interface
537  batch_range batches(){
538  return batch_range(this);
539  }
540 
541  ///\brief Returns the number of batches of the set.
542  std::size_t numberOfBatches() const{
543  return m_data.numberOfBatches();
544  }
545  ///\brief Returns the total number of elements.
546  std::size_t numberOfElements() const{
547  return m_data.numberOfElements();
548  }
549 
550  ///\brief Check whether the set is empty.
551  bool empty() const{
552  return m_data.empty();
553  }
554 
555  ///\brief Access to inputs as a separate container.
556  InputContainer const& inputs() const{
557  return m_data;
558  }
559  ///\brief Access to inputs as a separate container.
560  InputContainer& inputs(){
561  return m_data;
562  }
563 
564  ///\brief Access to labels as a separate container.
565  LabelContainer const& labels() const{
566  return m_label;
567  }
568  ///\brief Access to labels as a separate container.
569  LabelContainer& labels(){
570  return m_label;
571  }
572 
573  // CONSTRUCTORS
574 
575  ///\brief Empty data set.
577  {}
578 
579  ///\brief Create an empty set with just the correct number of batches.
580  ///
581  /// The user must initialize the dataset after that by himself.
582  LabeledData(std::size_t numBatches)
583  : m_data(numBatches),m_label(numBatches)
584  {}
585 
586  ///
587  /// Optionally the desired batch Size can be set
588  ///
589  ///@param size the new size of the container
590  ///@param element the blueprint element from which to create the Container
591  ///@param batchSize the size of the batches. if this is 0, the size is unlimited
592  LabeledData(std::size_t size, element_type const& element, std::size_t batchSize = DefaultBatchSize)
593  : m_data(size,element.input,batchSize),
594  m_label(size,element.label,batchSize)
595  {}
596 
597  ///\brief Construction from data.
598  ///
599  /// Beware that when calling this constructor the organization of batches must be equal in both
600  /// containers. This Constructor will not split the data!
601  LabeledData(Data<InputType> const& inputs, Data<LabelType> const& labels)
602  : m_data(inputs), m_label(labels)
603  {
604  SHARK_RUNTIME_CHECK(inputs.numberOfElements() == labels.numberOfElements(), "number of inputs and number of labels must agree");
605 #ifndef DNDEBUG
606  for(std::size_t i = 0; i != inputs.numberOfBatches(); ++i){
608  }
609 #endif
610  }
611  // ELEMENT ACCESS
612  element_reference element(std::size_t i){
613  return *(detail::DataElementIterator<LabeledData<InputType,LabelType> >(this,0,0,0)+i);
614  }
615  const_element_reference element(std::size_t i) const{
616  return *(detail::DataElementIterator<LabeledData<InputType,LabelType> const>(this,0,0,0)+i);
617  }
618 
619  // BATCH ACCESS
620  batch_reference batch(std::size_t i){
621  return batch_reference(m_data.batch(i),m_label.batch(i));
622  }
623  const_batch_reference batch(std::size_t i) const{
624  return const_batch_reference(m_data.batch(i),m_label.batch(i));
625  }
626 
627  ///\brief Returns the Shape of the inputs.
628  Shape const& inputShape() const{
629  return m_data.shape();
630  }
631 
632  ///\brief Returns the Shape of the inputs.
634  return m_data.shape();
635  }
636 
637  ///\brief Returns the Shape of the labels.
638  Shape const& labelShape() const{
639  return m_label.shape();
640  }
641 
642  ///\brief Returns the Shape of the labels.
644  return m_label.shape();
645  }
646 
647  // MISC
648 
649  /// from ISerializable
650  void read(InArchive& archive){
651  archive & m_data;
652  archive & m_label;
653  }
654 
655  /// from ISerializable
656  void write(OutArchive& archive) const{
657  archive & m_data;
658  archive & m_label;
659  }
660 
661  ///\brief This method makes the vector independent of all siblings and parents.
662  virtual void makeIndependent(){
663  m_label.makeIndependent();
664  m_data.makeIndependent();
665  }
666 
667  ///\brief shuffles all elements in the entire dataset (that is, also across the batches)
668  virtual void shuffle(){
669  shark::shuffle(this->elements().begin(),this->elements().end(), random::globalRng);
670  }
671 
672  void splitBatch(std::size_t batch, std::size_t elementIndex){
673  m_data.splitBatch(batch,elementIndex);
674  m_label.splitBatch(batch,elementIndex);
675  }
676 
677  ///\brief Splits the container into two independent parts. The left part remains in the container, the right is stored as return type
678  ///
679  ///Order of elements remain unchanged. The SharedVector is not allowed to be shared for
680  ///this to work.
681  LabeledData splice(std::size_t batch){
682  return LabeledData(m_data.splice(batch),m_label.splice(batch));
683  }
684 
685  /// \brief Appends the contents of another data object to the end
686  ///
687  /// The batches are not copied but now referenced from both datasets. Thus changing the appended
688  /// dataset might change this one as well.
689  void append(LabeledData const& other){
690  m_data.append(other.m_data);
691  m_label.append(other.m_label);
692  }
693 
694  void push_back(
695  typename Batch<InputType>::type const& inputs,
696  typename Batch<LabelType>::type const& labels
697  ){
698  m_data.push_back(inputs);
699  m_label.push_back(labels);
700  }
701 
702  void push_back(
704  ){
705  push_back(batch.input,batch.label);
706  }
707 
708 
709  ///\brief Reorders the batch structure in the container to that indicated by the batchSizes vector
710  ///
711  ///After the operation the container will contain batchSizes.size() batches with the i-th batch having size batchSize[i].
712  ///However the sum of all batch sizes must be equal to the current number of elements
713  template<class Range>
714  void repartition(Range const& batchSizes){
715  m_data.repartition(batchSizes);
716  m_label.repartition(batchSizes);
717  }
718 
719  /// \brief Creates a vector with the batch sizes of every batch.
720  ///
721  /// This method can be used together with repartition to ensure
722  /// that two datasets have the same batch structure.
723  std::vector<std::size_t> getPartitioning()const{
724  return m_data.getPartitioning();
725  }
726 
727  friend void swap(LabeledData& a, LabeledData& b){
728  swap(a.m_data,b.m_data);
729  swap(a.m_label,b.m_label);
730  }
731 
732 
733  // SUBSETS
734 
735  ///\brief Fill in the subset defined by the list of indices.
736  LabeledData indexedSubset(IndexSet const& indices) const{
737  return LabeledData(m_data.indexedSubset(indices),m_label.indexedSubset(indices));
738  }
739 protected:
740  InputContainer m_data; /// point data
741  LabelContainer m_label; /// label data
742 };
743 
744 /// specialized template for classification with unsigned int labels
746 
747 /// specialized template for regression with RealVector labels
749 
750 /// specialized template for classification with unsigned int labels and sparse data
752 
753 template<class Functor, class T>
756 };
757 
758 namespace detail{
759 template<class T>
760 struct InferShape{
761  static Shape infer(T const&){return {};}
762 };
763 
764 template<class T>
765 struct InferShape<Data<blas::vector<T> > >{
766  static Shape infer(Data<blas::vector<T> > const& f){
767  return {f.element(0).size()};
768  }
769 };
770 
771 template<class T>
772 struct InferShape<Data<blas::compressed_vector<T> > >{
773  static Shape infer(Data<blas::compressed_vector<T> > const& f){
774  return {f.element(0).size()};
775  }
776 };
777 
778 }
779 
780 /**
781  * \addtogroup shark_globals
782  * @{
783  */
784 
785 /// \brief creates a data object from a range of elements
786 template<class Range>
788 createDataFromRange(Range const& inputs, std::size_t maximumBatchSize = 0){
789  typedef typename Range::value_type Input;
790 
791  if (maximumBatchSize == 0)
792  maximumBatchSize = Data<Input>::DefaultBatchSize;
793 
794  std::size_t numPoints = inputs.size();
795  //first determine the optimal number of batches as well as batch size
796  std::size_t batches = numPoints / maximumBatchSize;
797  if(numPoints > batches*maximumBatchSize)
798  ++batches;
799  std::size_t optimalBatchSize=numPoints/batches;
800  std::size_t remainder = numPoints-batches*optimalBatchSize;
801  Data<Input> data(batches);
802 
803  //now create the batches taking the remainder into account
804  auto start= inputs.begin();
805  for(std::size_t i = 0; i != batches; ++i){
806  std::size_t size = (i<remainder)?optimalBatchSize+1:optimalBatchSize;
807  auto end = start+size;
808  data.batch(i) = createBatch<Input>(
809  boost::make_iterator_range(start,end)
810  );
811  start = end;
812  }
813  data.shape() = detail::InferShape<Data<Input> >::infer(data);
814  return data;
815 }
816 
817 /// \brief creates a data object from a range of elements
818 template<class Range>
820 createUnlabeledDataFromRange(Range const& inputs, std::size_t maximumBatchSize = 0){
821  return createDataFromRange(inputs,maximumBatchSize);
822 }
823 /// \brief creates a labeled data object from two ranges, representing inputs and labels
824 template<class Range1, class Range2>
826  typename boost::range_value<Range1>::type,
827  typename boost::range_value<Range2>::type
828 
829 > createLabeledDataFromRange(Range1 const& inputs, Range2 const& labels, std::size_t maximumBatchSize = 0){
830  SHARK_RUNTIME_CHECK(inputs.size() == labels.size(),"Number of inputs and number of labels must agree");
831  typedef typename boost::range_value<Range1>::type Input;
832  typedef typename boost::range_value<Range2>::type Label;
833 
834  if (maximumBatchSize == 0)
836 
838  createDataFromRange(inputs,maximumBatchSize),
839  createDataFromRange(labels,maximumBatchSize)
840  );
841 }
842 
843 ///brief Outstream of elements for labeled data.
844 template<class T, class U>
845 std::ostream &operator << (std::ostream &stream, const LabeledData<T, U>& d) {
846  for(auto elem: d.elements())
847  stream << elem.input << " [" << elem.label <<"]"<< "\n";
848  return stream;
849 }
850 
851 
852 // FUNCTIONS FOR DIMENSIONALITY
853 
854 
855 ///\brief Return the number of classes of a set of class labels with unsigned int label encoding
856 inline unsigned int numberOfClasses(Data<unsigned int> const& labels){
857  unsigned int classes = 0;
858  for(std::size_t i = 0; i != labels.numberOfBatches(); ++i){
859  classes = std::max(classes,*std::max_element(labels.batch(i).begin(),labels.batch(i).end()));
860  }
861  return classes+1;
862 }
863 
864 ///\brief Returns the number of members of each class in the dataset.
865 inline std::vector<std::size_t> classSizes(Data<unsigned int> const& labels){
866  std::vector<std::size_t> classCounts(numberOfClasses(labels),0u);
867  for(std::size_t i = 0; i != labels.numberOfBatches(); ++i){
868  for(unsigned int elem: labels.batch(i)){
869  classCounts[elem]++;
870  }
871  }
872  return classCounts;
873 }
874 
875 ///\brief Return the dimensionality of a dataset.
876 template <class InputType>
877 std::size_t dataDimension(Data<InputType> const& dataset){
878  SHARK_ASSERT(dataset.numberOfElements() > 0);
879  return dataset.element(0).size();
880 }
881 
882 ///\brief Return the input dimensionality of a labeled dataset.
883 template <class InputType, class LabelType>
885  return dataDimension(dataset.inputs());
886 }
887 
888 ///\brief Return the label/output dimensionality of a labeled dataset.
889 template <class InputType, class LabelType>
891  return dataDimension(dataset.labels());
892 }
893 ///\brief Return the number of classes (highest label value +1) of a classification dataset with unsigned int label encoding
894 template <class InputType>
896  return numberOfClasses(dataset.labels());
897 }
///\brief Returns the number of members of each class in the dataset.
///
/// Forwards to the Data<unsigned int> overload on the label part of the dataset.
template<class InputType, class LabelType>
inline std::vector<std::size_t> classSizes(LabeledData<InputType, LabelType> const& dataset){
	return classSizes(dataset.labels());
}
903 
904 // TRANSFORMATION
905 ///\brief Transforms a dataset using a Functor f and returns the transformed result.
906 ///
907 /// this version is used, when the Functor supports only element-by-element transformations
908 template<class T,class Functor>
909 typename boost::lazy_disable_if<
912 >::type
913 transform(Data<T> const& data, Functor f){
914  typedef typename detail::TransformedDataElement<Functor,T>::type ResultType;
915  int batches = (int) data.numberOfBatches();
916  Data<ResultType> result(batches);
917  SHARK_PARALLEL_FOR(int i = 0; i < batches; ++i)
918  result.batch(i)= createBatch<ResultType>(
919  boost::make_transform_iterator(batchBegin(data.batch(i)), f),
920  boost::make_transform_iterator(batchEnd(data.batch(i)), f)
921  );
922  result.shape() = detail::InferShape<Data<ResultType> >::infer(result);
923  return result;
924 }
925 
926 ///\brief Transforms a dataset using a Functor f and returns the transformed result.
927 ///
928 /// this version is used, when the Functor supports batch-by-batch transformations
929 template<class T,class Functor>
930 typename boost::lazy_enable_if<
931  CanBeCalled<Functor,typename Data<T>::batch_type>,
933 >::type
934 transform(Data<T> const& data, Functor const& f){
935  typedef typename detail::TransformedDataElement<Functor,T>::type ResultType;
936  int batches = (int) data.numberOfBatches();
937  Data<ResultType> result(batches);
938  SHARK_PARALLEL_FOR(int i = 0; i < batches; ++i)
939  result.batch(i)= f(data.batch(i));
940  Shape shape = detail::InferShape<Functor>::infer(f);
941  if(shape == Shape()){
942  shape = detail::InferShape<Data<ResultType> >::infer(result);
943  }
944  result.shape() = shape;
945  return result;
946 }
947 
948 ///\brief Transforms the inputs of a dataset and return the transformed result.
949 template<class I,class L, class Functor>
951 transformInputs(LabeledData<I,L> const& data, Functor const& f){
953  return DatasetType(transform(data.inputs(),f),data.labels());
954 }
955 ///\brief Transforms the labels of a dataset and returns the transformed result.
956 template<class I,class L, class Functor>
958 transformLabels(LabeledData<I,L> const& data, Functor const& f){
960  return DatasetType(data.inputs(),transform(data.labels(),f));
961 }
962 
963 ///\brief Creates a copy of a dataset selecting only a certain set of features.
964 template<class T, class FeatureSet>
965 Data<blas::vector<T> > selectFeatures(Data<blas::vector<T> > const& data,FeatureSet const& features){
966  auto select = [&](blas::matrix<T> const& input){
967  blas::matrix<T> output(input.size1(),features.size());
968  for(std::size_t i = 0; i != input.size1(); ++i){
969  for(std::size_t j = 0; j != features.size(); ++j){
970  output(i,j) = input(i,features[j]);
971  }
972  }
973  return output;
974  };
975  return transform(data,select);
976 }
977 
978 template<class T, class FeatureSet>
980  return LabeledData<RealVector,T>(selectFeatures(data.inputs(),features), data.labels());
981 }
982 
983 
984 
/// \brief Removes the last part of a given dataset and returns a new split containing the removed elements
///
/// For this operation, the dataset is not allowed to be shared.
/// \param data The dataset which should be split
/// \param elementIndex the index of the first element to be moved into the returned split
/// \returns the set which contains the split elements (right part of the given set)
template<class DatasetT>
DatasetT splitAtElement(DatasetT& data, std::size_t elementIndex){
	SIZE_CHECK(elementIndex<=data.numberOfElements());

	// walk forward until batchPos is the batch containing elementIndex,
	// with batchStart the global index of that batch's first element
	std::size_t batchPos = 0;
	std::size_t batchStart = 0;
	while(batchStart + batchSize(data.batch(batchPos)) < elementIndex){
		batchStart += batchSize(data.batch(batchPos));
		++batchPos;
	};
	// offset of the cut inside its batch
	std::size_t splitPoint = elementIndex-batchStart;
	if(splitPoint != 0){
		// the cut falls inside a batch: split that batch so the cut lies on a batch boundary
		data.splitBatch(batchPos,splitPoint);
		++batchPos;
	}

	// move all batches from batchPos onwards into the returned dataset
	return data.splice(batchPos);
}
1009 
1010 
1011 ///\brief reorders the dataset such, that points are grouped by labels
1012 ///
1013 /// The elements are not only reordered but the batches are also resized such, that every batch
1014 /// only contains elements of one class. This method must be used in order to use binarySubproblem.
1015 template<class I>
1017  std::vector<std::size_t > classCounts = classSizes(data);
1018  std::vector<std::size_t > partitioning;//new, optimal partitioning of the data according to the batch sizes
1019  std::vector<std::size_t > classStart;//at which batch the elements of the class are starting
1020  detail::batchPartitioning(classCounts, classStart, partitioning, batchSize);
1021 
1022  data.repartition(partitioning);
1023 
1024  // Now place examples into the batches reserved for their class...
1025  std::vector<std::size_t> bat = classStart; // batch index until which the class is already filled in
1026  std::vector<std::size_t> idx(classStart.size(), 0); // index within the batch until which the class is already filled in
1027  std::size_t c = 0; // the current class space we are operating in
1028  for (std::size_t b=0; b<data.numberOfBatches(); b++){
1029  //check if we changed class space
1030  if(b == classStart[c+1]) ++c;
1031  auto&& batch = data.batch(b);
1032  std::size_t e = 0;
1033  while (true){
1034  unsigned int label = batch.label[e];
1035  if (label == c){ // leave element in place
1036  ++e;
1037  ++idx[c];
1038  if (e == batch.size())
1039  {
1040  e = 0;
1041  idx[c] = 0;
1042  bat[c]++;
1043  break;
1044  }
1045  }
1046  else{ // swap elements
1047  auto&& batch2 = data.batch(bat[label]);
1048  using std::swap;
1049  swap(batch[e],batch2[idx[label]]);
1050  idx[label]++;
1051  if (idx[label] == batch2.size())
1052  {
1053  idx[label] = 0;
1054  bat[label]++;
1055  }
1056  }
1057  }
1058  }
1059 }
1060 
1061 template<class I>
1063  LabeledData<I,unsigned int>const& data,
1064  unsigned int zeroClass,
1065  unsigned int oneClass
1066 ){
1067  std::vector<std::size_t> indexSet;
1068  std::size_t smaller = std::min(zeroClass,oneClass);
1069  std::size_t bigger = std::max(zeroClass,oneClass);
1070  std::size_t numBatches = data.numberOfBatches();
1071 
1072  //find first class
1073  std::size_t start= 0;
1074  for(;start != numBatches && getBatchElement(data.batch(start),0).label != smaller;++start);
1075  SHARK_RUNTIME_CHECK(start != numBatches, "First class does not exist");
1076 
1077  //copy batch indices of first class
1078  for(;start != numBatches && getBatchElement(data.batch(start),0).label == smaller; ++start)
1079  indexSet.push_back(start);
1080 
1081  //find second class
1082 
1083  for(;start != numBatches && getBatchElement(data.batch(start),0).label != bigger;++start);
1084  SHARK_RUNTIME_CHECK(start != numBatches, "Second class does not exist");
1085 
1086  //copy batch indices of second class
1087  for(;start != numBatches && getBatchElement(data.batch(start),0).label == bigger; ++start)
1088  indexSet.push_back(start);
1089 
1090  return transformLabels(data.indexedSubset(indexSet), [=](unsigned int label){return (unsigned int)(label == oneClass);});
1091 }
1092 
1093 /// \brief Construct a binary (two-class) one-versus-rest problem from a multi-class problem.
1094 ///
1095 /// \par
1096 /// The function returns a new LabeledData object. The input part
1097 /// coincides with the multi-class data, but the label part is replaced
1098 /// with binary labels 0 and 1. All instances of the given class
1099 /// (parameter oneClass) get a label of one, all others are assigned a
1100 /// label of zero.
1101 template<class I>
1103  LabeledData<I,unsigned int>const& data,
1104  unsigned int oneClass
1105 ){
1106  return transformLabels(data, [=](unsigned int label){return (unsigned int)(label == oneClass);});
1107 }
1108 
1109 template <typename RowType>
1110 RowType getColumn(Data<RowType> const& data, std::size_t columnID) {
1111  SHARK_ASSERT(dataDimension(data) > columnID);
1112  RowType column(data.numberOfElements());
1113  std::size_t rowCounter = 0;
1114  for(auto element: data.elements()){
1115  column(rowCounter) = element(columnID);
1116  rowCounter++;
1117  }
1118  return column;
1119 }
1120 
1121 template <typename RowType>
1122 void setColumn(Data<RowType>& data, std::size_t columnID, RowType newColumn) {
1123  SHARK_ASSERT(dataDimension(data) > columnID);
1124  SHARK_ASSERT(data.numberOfElements() == newColumn.size());
1125  std::size_t rowCounter = 0;
1126  for(auto element: data.elements()){
1127  element(columnID) = newColumn(rowCounter);
1128  rowCounter++;
1129  }
1130 }
1131 
/** @}*/
1133 }
1134 
1135 #endif