DataView.h
Go to the documentation of this file.
1 //===========================================================================
2 /*!
3  *
4  *
5  * \brief Fast lookup for elements in constant datasets
6  *
7  *
8  *
9  *
10  *
11  * \author O. Krause
12  * \date 2012
13  *
14  *
15  * \par Copyright 1995-2017 Shark Development Team
16  *
17  * <BR><HR>
18  * This file is part of Shark.
19  * <http://shark-ml.org/>
20  *
21  * Shark is free software: you can redistribute it and/or modify
22  * it under the terms of the GNU Lesser General Public License as published
23  * by the Free Software Foundation, either version 3 of the License, or
24  * (at your option) any later version.
25  *
26  * Shark is distributed in the hope that it will be useful,
27  * but WITHOUT ANY WARRANTY; without even the implied warranty of
28  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29  * GNU Lesser General Public License for more details.
30  *
31  * You should have received a copy of the GNU Lesser General Public License
32  * along with Shark. If not, see <http://www.gnu.org/licenses/>.
33  *
34  */
35 //===========================================================================
36 
37 
38 #ifndef SHARK_DATA_DATAVIEW_H
39 #define SHARK_DATA_DATAVIEW_H
40 
41 #include <shark/Data/Dataset.h>
43 #include <numeric>
44 namespace shark {
45 /// \brief Constant time Element-Lookup for Datasets
46 ///
47 /// Datasets are fast for random lookup of batches. Since batch sizes can be arbitrary structured and
48 /// changed by the user, there is no way for the Data and LabeledData classes to provide fast random access
49 /// to single elements. Still, this property is needed quite often, for example for creating subsets,
50 /// randomize data or tree structures.
51 /// A View stores the position of every element in a dataset. So it has constant time access to the elements but
52 /// it also requires linear memory in the number of elements in the set. This is typically small compared
53 /// to the size of the set itself, but construction imposes an considerable overhead.
54 ///
55 /// In contrast to (Un)LabeledData, which is centered around batches, the View is centered around single elements,
56 /// so its iterators iterate over the elements.
57 /// For a better support for bagging an index method is added which returns the position of the element in the
58 /// underlying data container. Also the iterators are indexed and return this index.
59 template <class DatasetType> //template parameter can be const!
60 class DataView
61 {
62 public:
63  typedef typename std::remove_const<DatasetType>::type dataset_type; //(non const) type of the underlying dataset
64  typedef typename dataset_type::element_type value_type;
65  typedef typename dataset_type::const_element_reference const_reference;
66  typedef typename dataset_type::batch_type batch_type;
67  // We want to support immutable as well as mutable datasets. So we query whether the dataset
68  // is mutable and change the reference type to const if the dataset is immutable.
69  typedef typename boost::mpl::if_<
70  std::is_const<DatasetType>,
71  typename dataset_type::const_element_reference,
72  typename dataset_type::element_reference
73  >::type reference;
74 
75 private:
76  typedef typename boost::mpl::if_<
77  std::is_const<DatasetType>,
78  typename dataset_type::const_batch_range,
79  typename dataset_type::batch_range
80  >::type batch_range;
81  template<class Reference, class View>
82  class IteratorBase: public SHARK_ITERATOR_FACADE<
83  IteratorBase<Reference,View>,
84  value_type,
85  std::random_access_iterator_tag,
86  Reference
87  >{
88  public:
89  IteratorBase(){}
90 
91  IteratorBase(View& view, std::size_t position)
92  : mpe_view(&view),m_position(position) {}
93 
94  template<class R,class V>
95  IteratorBase(IteratorBase<R,V> const& other)
96  : mpe_view(other.mpe_view),m_position(other.position){}
97 
98  /// \brief returns the position of the element referenced by the iterator inside the dataset
99  ///
100  /// This is usefull for bagging, when identical elements between several susbsets are to be identified
101  std::size_t index()const{
102  return mpe_view->index(m_position);
103  }
104 
105  private:
106  friend class SHARK_ITERATOR_CORE_ACCESS;
107  template <class, class> friend class IteratorBase;
108 
109  void increment() {
110  ++m_position;
111  }
112  void decrement() {
113  --m_position;
114  }
115 
116  void advance(std::ptrdiff_t n){
117  m_position+=n;
118  }
119 
120  template<class R,class V>
121  std::ptrdiff_t distance_to(IteratorBase<R,V> const& other) const{
122  return (std::ptrdiff_t)other.m_position - (std::ptrdiff_t)m_position;
123  }
124 
125  template<class R,class V>
126  bool equal(IteratorBase<R,V> const& other) const{
127  return m_position == other.m_position;
128  }
129  Reference dereference() const {
130  return (*mpe_view)[m_position];
131  }
132 
133  View* mpe_view;
134  std::size_t m_position;
135  };
136 public:
137  typedef IteratorBase<reference,DataView<DatasetType> > iterator;
138  typedef IteratorBase<const_reference, DataView<DatasetType> const > const_iterator;
139 
141  DataView(DatasetType& dataset)
142  :m_dataset(dataset),m_indices(dataset.numberOfElements())
143  {
144  std::size_t index = 0;
145  for(std::size_t i = 0; i != dataset.numberOfBatches(); ++i){
146  std::size_t batchSize = Batch<value_type>::size(dataset.batch(i));
147  for(std::size_t j = 0; j != batchSize; ++j,++index){
148  m_indices[index].batch = i;
149  m_indices[index].positionInBatch = j;
150  m_indices[index].datasetIndex = index;
151  }
152  }
153  }
154 
155  /// create a subset of the dataset type using only the elemnt indexed by indices
156  template<class IndexRange>
157  DataView(DataView<DatasetType> const& view, IndexRange const& indices)
158  :m_dataset(view.m_dataset),m_indices(indices.size())
159  {
160  for(std::size_t i = 0; i != m_indices.size(); ++i)
161  m_indices[i] = view.m_indices[indices[i]];
162  }
163 
164  reference operator[](std::size_t position){
165  SIZE_CHECK(position < size());
166  Index const& index = m_indices[position];
167  return getBatchElement(m_dataset.batch(index.batch),index.positionInBatch);
168  }
169  const_reference operator[](std::size_t position) const{
170  SIZE_CHECK(position < size());
171  Index const& index = m_indices[position];
172  return getBatchElement(m_dataset.batch(index.batch),index.positionInBatch);
173  }
174 
175  reference front(){
176  SIZE_CHECK(size() != 0);
177  return (*this)[0];
178  }
179  const_reference front()const{
180  SIZE_CHECK(size() != 0);
181  return (*this)[0];
182  }
183  reference back(){
184  SIZE_CHECK(size() != 0);
185  return (*this)[size()-1];
186  }
187  const_reference back()const{
188  SIZE_CHECK(size() != 0);
189  return (*this)[size()-1];
190  }
191 
192  /// \brief Position of the element in the dataset.
193  ///
194  /// This is useful for bagging, when identical elements among
195  /// several subsets are to be identified.
196  std::size_t index(std::size_t position)const{
197  return m_indices[position].datasetIndex;
198  }
199 
200  /// \brief Index of the batch holding the element.
201  std::size_t batch(std::size_t position) const {
202  return m_indices[position].batch;
203  }
204 
205  /// \brief Index inside the batch holding the element.
206  std::size_t positionInBatch(std::size_t position) const {
207  return m_indices[position].positionInBatch;
208  }
209 
210  std::size_t size() const{
211  return m_indices.size();
212  }
213 
214  iterator begin(){
215  return iterator(*this, 0);
216  }
217  const_iterator begin() const{
218  return const_iterator(*this, 0);
219  }
220  iterator end(){
221  return iterator(*this, size());
222  }
223  const_iterator end() const{
224  return const_iterator(*this, size());
225  }
226 
227  dataset_type const& dataset()const{
228  return m_dataset;
229  }
230 private:
231  dataset_type m_dataset;
232  // Stores for an element of the dataset, at which position of which batch it is located
233  // as well as the real index of the element inside the dataset
234  struct Index{
235  std::size_t batch;//the batch in which the element is located
236  std::size_t positionInBatch;//at which position in the batch it is
237  std::size_t datasetIndex;//index inside the dataset
238  };
239  std::vector<Index> m_indices;//stores for every element of the set it's position inside the dataset
240 };
241 
242 /// \brief creates a subset of a DataView with elements indexed by indices
243 ///
244 /// \param view the view for which the subset is to be created
245 /// \param indizes the index of the elements to be stored in the view
246 template<class DatasetType,class IndexRange>
247 DataView<DatasetType> subset(DataView<DatasetType> const& view, IndexRange const& indizes){
248  //O.K. todo: Remove constructor later on, this is a quick fix.
249  return DataView<DatasetType>(view,indizes);
250 }
251 
252 /// \brief creates a random subset of a DataView with given size
253 ///
254 /// \param view the view for which the subset is to be created
255 /// \param size the size of the subset
256 template<class DatasetType>
258  std::vector<std::size_t> indices(view.size());
259  std::iota(indices.begin(),indices.end(),0);
260  partial_shuffle(indices.begin(),indices.begin()+size,indices.end());
261  return subset(view,boost::make_iterator_range(indices.begin(),indices.begin()+size));
262 }
263 
264 /// \brief Creates a batch given a set of indices
265 ///
266 /// \param view the view from which the batch is to be created
267 /// \param indizes the set of indizes defining the batch
268 template<class DatasetType,class IndexRange>
269 typename DataView<DatasetType>::batch_type subBatch(
270  DataView<DatasetType> const& view,
271  IndexRange const& indizes
272 ){
273  //create a subset of the view containing the elements of the batch
274  DataView<DatasetType> batchElems = subset(view,indizes);
275 
276  //and now use the batch range construction to create it
277  return createBatch(batchElems);
278 }
279 
280 /// \brief Creates a random batch of a given size
281 ///
282 /// \param view the view from which the batch is to be created
283 /// \param size the size of the batch
284 template<class DatasetType>
285 typename DataView<DatasetType>::batch_type randomSubBatch(
286  DataView<DatasetType> const& view,
287  std::size_t size
288 ){
289  std::vector<std::size_t> indices(view.size());
290  std::iota(indices.begin(),indices.end(),0);
291  partial_shuffle(indices.begin(),indices.begin()+size,indices.end());
292  return subBatch(view,boost::make_iterator_range(indices.begin(),indices.begin()+size));
293 }
294 
295 /// \brief Creates a View from a dataset.
296 ///
297 /// This is just a helper function to omit the actual type of the view
298 ///
299 /// \param set the dataset from which to create the view
300 template<class DatasetType>
301 DataView<DatasetType> toView(DatasetType& set){
302  return DataView<DatasetType>(set);
303 }
304 
305 /// \brief Creates a new dataset from a View.
306 ///
307 /// When the elements of a View needs to be processed repeatedly it is often better to use
308 /// the packed format of the Dataset again, since then the faster batch processing can be used
309 ///
310 /// \param view the view from which to create the new dataset
311 /// \param batchSize the size of the batches in the dataset
312 template<class T>
313 typename DataView<T>::dataset_type
315  if(view.size() == 0)
316  return typename DataView<T>::dataset_type();
317  //O.K. todo: this is slow for sparse elements, use subBatch or something similar.
318  std::size_t elements = view.size();
319  typename DataView<T>::dataset_type dataset(elements,view[0],batchSize);
320  std::copy(view.begin(),view.end(),dataset.elements().begin());
321  return dataset;
322 }
323 
324 /// Return the number of classes (size of the label vector)
325 /// of a classification dataset with RealVector label encoding.
326 template <class DatasetType>
327 std::size_t numberOfClasses(DataView<DatasetType> const& view){
328  return numberOfClasses(view.dataset());
329 }
330 
331 /// Return the input dimensionality of the labeled dataset represented by the view
332 template <class DatasetType>
333 std::size_t inputDimension(DataView<DatasetType> const& view){
334  return inputDimension(view.dataset());
335 }
336 /// Return the label dimensionality of the labeled dataset represented by the view
337 template <class DatasetType>
338 std::size_t labelDimension(DataView<DatasetType> const& view){
339  return labelDimension(view.dataset());
340 }
341 
342 /// Return the dimensionality of the dataset represented by the view
343 template <class DatasetType>
344 std::size_t dataDimension(DataView<DatasetType> const& view){
345  return dataDimension(view.dataset());
346 }
347 
348 
349 /** @*/
350 }
351 #endif