SparseData.h
Go to the documentation of this file.
1 //===========================================================================
2 /*!
3  *
4  *
5  * \brief Support for importing and exporting data from and to files in the sparse data (libSVM) format
6  *
7  *
8  * \par
9  * The most important application of the methods provided in this
10  * file is the import of data from LIBSVM files to Shark Data containers.
11  *
12  *
13  *
14  *
15  * \author M. Tuma, T. Glasmachers, C. Igel
16  * \date 2010-2016
17  *
18  *
19  * \par Copyright 1995-2017 Shark Development Team
20  *
21  * <BR><HR>
22  * This file is part of Shark.
23  * <http://shark-ml.org/>
24  *
25  * Shark is free software: you can redistribute it and/or modify
26  * it under the terms of the GNU Lesser General Public License as published
27  * by the Free Software Foundation, either version 3 of the License, or
28  * (at your option) any later version.
29  *
30  * Shark is distributed in the hope that it will be useful,
31  * but WITHOUT ANY WARRANTY; without even the implied warranty of
32  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
33  * GNU Lesser General Public License for more details.
34  *
35  * You should have received a copy of the GNU Lesser General Public License
36  * along with Shark. If not, see <http://www.gnu.org/licenses/>.
37  *
38  */
39 //===========================================================================
40 
41 #ifndef SHARK_DATA_SPARSEDATA_H
42 #define SHARK_DATA_SPARSEDATA_H
43 
44 #include <shark/Core/DLLSupport.h>
46 #include <shark/Data/Dataset.h>
47 #include <fstream>
48 
49 namespace shark {
50 
51 /**
52  * \ingroup shark_globals
53  *
54  * @{
55  */
56 
57 
58 
59 /// \brief Import classification data from a sparse data (libSVM) stream.
60 ///
61 /// \param dataset container receiving the loaded data (RealVector inputs, unsigned int labels)
62 /// \param stream input stream to be read from
63 /// \param highestIndex highest feature index, or 0 (the default) for auto-detection
64 /// \param batchSize size of batch. NOTE(review): listing lines 65 and 69 are absent here, so the declarator line (return type and function name) and the declaration of this parameter are not visible - restore them from the original header.
66  LabeledData<RealVector, unsigned int>& dataset,
67  std::istream& stream,
68  unsigned int highestIndex = 0,
70 );
71 
72 /// \brief Import regression data from a sparse data (libSVM) stream.
73 ///
74 /// \param dataset container receiving the loaded data (RealVector inputs, RealVector labels)
75 /// \param stream input stream to be read from
76 /// \param highestIndex highest feature index, or 0 (the default) for auto-detection
77 /// \param batchSize size of batch. NOTE(review): listing lines 78 and 82 are absent here, so the declarator line (return type and function name) and the declaration of this parameter are not visible - restore them from the original header.
79  LabeledData<RealVector, RealVector>& dataset,
80  std::istream& stream,
81  unsigned int highestIndex = 0,
83 );
84 
85 /// \brief Import classification data from a sparse data (libSVM) stream.
86 ///
87 /// \param dataset container receiving the loaded data (CompressedRealVector inputs, unsigned int labels)
88 /// \param stream input stream to be read from
89 /// \param highestIndex highest feature index, or 0 (the default) for auto-detection
90 /// \param batchSize size of batch. NOTE(review): listing lines 91 and 95 are absent here, so the declarator line (return type and function name) and the declaration of this parameter are not visible - restore them from the original header.
92  LabeledData<CompressedRealVector, unsigned int>& dataset,
93  std::istream& stream,
94  unsigned int highestIndex = 0,
96 );
97 
98 /// \brief Import regression data from a sparse data (libSVM) stream.
99 ///
100 /// \param dataset container receiving the loaded data (CompressedRealVector inputs, RealVector labels)
101 /// \param stream input stream to be read from
102 /// \param highestIndex highest feature index, or 0 (the default) for auto-detection
103 /// \param batchSize size of batch. NOTE(review): listing lines 104 and 108 are absent here, so the declarator line (return type and function name) and the declaration of this parameter are not visible - restore them from the original header.
105  LabeledData<CompressedRealVector, RealVector>& dataset,
106  std::istream& stream,
107  unsigned int highestIndex = 0,
109 );
110 
111 /// \brief Import classification data from a sparse data (libSVM) file given by name.
112 ///
113 /// \param dataset container receiving the loaded data (RealVector inputs, unsigned int labels)
114 /// \param fn name of the file to be read from
115 /// \param highestIndex highest feature index, or 0 (the default) for auto-detection
116 /// \param batchSize size of batch. NOTE(review): listing lines 117 and 121 are absent here, so the declarator line (return type and function name) and the declaration of this parameter are not visible - restore them from the original header.
118  LabeledData<RealVector, unsigned int>& dataset,
119  std::string fn,
120  unsigned int highestIndex = 0,
122 );
123 
124 /// \brief Import regression data from a sparse data (libSVM) file given by name.
125 ///
126 /// \param dataset container receiving the loaded data (RealVector inputs, RealVector labels)
127 /// \param fn name of the file to be read from
128 /// \param highestIndex highest feature index, or 0 (the default) for auto-detection
129 /// \param batchSize size of batch. NOTE(review): listing lines 130 and 134 are absent here, so the declarator line (return type and function name) and the declaration of this parameter are not visible - restore them from the original header.
131  LabeledData<RealVector, RealVector>& dataset,
132  std::string fn,
133  unsigned int highestIndex = 0,
135 );
136 
137 /// \brief Import classification data from a sparse data (libSVM) file given by name.
138 ///
139 /// \param dataset container receiving the loaded data (CompressedRealVector inputs, unsigned int labels)
140 /// \param fn name of the file to be read from
141 /// \param highestIndex highest feature index, or 0 (the default) for auto-detection
142 /// \param batchSize size of batch. NOTE(review): listing lines 143 and 147 are absent here, so the declarator line (return type and function name) and the declaration of this parameter are not visible - restore them from the original header.
144  LabeledData<CompressedRealVector, unsigned int>& dataset,
145  std::string fn,
146  unsigned int highestIndex = 0,
148 );
149 
150 /// \brief Import regression data from a sparse data (libSVM) file given by name.
151 ///
152 /// \param dataset container receiving the loaded data (CompressedRealVector inputs, RealVector labels)
153 /// \param fn name of the file to be read from
154 /// \param highestIndex highest feature index, or 0 (the default) for auto-detection
155 /// \param batchSize size of batch. NOTE(review): listing lines 156 and 160 are absent here, so the declarator line (return type and function name) and the declaration of this parameter are not visible - restore them from the original header.
157  LabeledData<CompressedRealVector, RealVector>& dataset,
158  std::string fn,
159  unsigned int highestIndex = 0,
161 );
162 
163 
164 /// \brief Export classification data to sparse data (libSVM) format.
165 ///
166 /// \param dataset Container storing the data
167 /// \param stream Output stream
168 /// \param oneMinusOne Flag for applying the transformation y<-2y-1 to binary labels
169 /// \param sortLabels Flag for sorting data points according to labels
170 template<typename InputType>
171 void exportSparseData(LabeledData<InputType, unsigned int> const& dataset, std::ostream& stream, bool oneMinusOne = true, bool sortLabels = false)
172 {
173  if (numberOfClasses(dataset) != 2) oneMinusOne = false;
174 
175  std::vector< KeyValuePair<unsigned int, std::pair<std::size_t, std::size_t> > > order;
176  for (std::size_t b=0; b<dataset.numberOfBatches(); b++)
177  {
178  auto batch = dataset.batch(b);
179  for (std::size_t i=0; i<batchSize(batch); i++)
180  {
181  order.emplace_back(getBatchElement(batch, i).label, std::make_pair(b, i));
182  }
183  }
184  if (sortLabels)
185  {
186  std::sort(order.begin(), order.end());
187  }
188 
189  for (auto const& p : order)
190  {
191  auto element = getBatchElement(dataset.batch(p.value.first), p.value.second);
192  // apply transformation to label and write it to file
193  if (oneMinusOne) stream << 2*int(element.label)-1 << " ";
194  //libsvm file format documentation is scarce, but by convention the first class seems to be 1..
195  else stream << element.label+1 << " ";
196  // write input data to file
197  for (auto it = element.input.begin(); it != element.input.end(); ++it)
198  {
199  stream << " " << it.index()+1 << ":" << *it;
200  }
201  stream << std::endl;
202  }
203 }
204 
205 /// \brief Export classification data to sparse data (libSVM) format.
206 ///
207 /// \param dataset Container storing the data
208 /// \param fn Output file name
209 /// \param oneMinusOne Flag for applying the transformation y<-2y-1 to binary labels
210 /// \param sortLabels Flag for sorting data points according to labels
211 /// \param append Flag for appending to the output file instead of overwriting it
212 template<typename InputType>
213 void exportSparseData(LabeledData<InputType, unsigned int> const& dataset, const std::string &fn, bool oneMinusOne = true, bool sortLabels = false, bool append = false)
214 {
215  std::ofstream ofs;
216 
217  // shall we append only or overwrite?
218  if (append == true) {
219  ofs.open (fn.c_str(), std::fstream::out | std::fstream::app );
220  } else {
221  ofs.open (fn.c_str());
222  }
223  SHARK_RUNTIME_CHECK(ofs, "File can not be opened for writing");
224 
225  exportSparseData(dataset, ofs, oneMinusOne, sortLabels);
226 }
227 
228 /// \brief Export regression data to sparse data (libSVM) format.
229 ///
230 /// \param dataset Container storing the data
231 /// \param stream Output stream
232 template<typename InputType>
233 void exportSparseData(LabeledData<InputType, RealVector> const& dataset, std::ostream& stream)
234 {
235  for (std::size_t b=0; b<dataset.numberOfBatches(); b++)
236  {
237  auto batch = dataset.batch(b);
238  for (std::size_t i=0; i<batchSize(batch); i++)
239  {
240  auto element = getBatchElement(batch, i);
241  SHARK_ASSERT(element.label.size() == 1);
242  stream << element.label(0);
243  for (auto it = element.input.begin(); it != element.input.end(); ++it)
244  {
245  stream << " " << it.index()+1 << ":" << *it;
246  }
247  stream << std::endl;
248  }
249  }
250 }
251 
252 /// \brief Export regression data to sparse data (libSVM) format.
253 ///
254 /// \param dataset Container storing the data
255 /// \param fn Output file
256 /// \param append Flag for appending to the output file instead of overwriting it
257 template<typename InputType>
258 void exportSparseData(LabeledData<InputType, RealVector> const& dataset, const std::string &fn, bool append = false)
259 {
260  std::ofstream ofs;
261 
262  // shall we append only or overwrite?
263  if (append == true) {
264  ofs.open (fn.c_str(), std::fstream::out | std::fstream::app );
265  } else {
266  ofs.open (fn.c_str());
267  }
268 
269  SHARK_RUNTIME_CHECK(ofs, "File can not be opened for writing");
270 
271  exportSparseData(dataset, ofs);
272 }
273 
274 /** @}*/
275 
276 }
277 #endif