Csv.h
Go to the documentation of this file.
1 //===========================================================================
2 /*!
3  *
4  *
5  * \brief Support for importing and exporting data from and to character separated value (CSV) files
6  *
7  *
8  * \par
9  * The most important application of the methods provided in this
10  * file is the import of data from CSV files into Shark data
11  * containers.
12  *
13  *
14  *
15  *
16  * \author T. Voss, M. Tuma
17  * \date 2010
18  *
19  *
20  * \par Copyright 1995-2017 Shark Development Team
21  *
22  * <BR><HR>
23  * This file is part of Shark.
24  * <http://shark-ml.org/>
25  *
26  * Shark is free software: you can redistribute it and/or modify
27  * it under the terms of the GNU Lesser General Public License as published
28  * by the Free Software Foundation, either version 3 of the License, or
29  * (at your option) any later version.
30  *
31  * Shark is distributed in the hope that it will be useful,
32  * but WITHOUT ANY WARRANTY; without even the implied warranty of
33  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
34  * GNU Lesser General Public License for more details.
35  *
36  * You should have received a copy of the GNU Lesser General Public License
37  * along with Shark. If not, see <http://www.gnu.org/licenses/>.
38  *
39  */
40 //===========================================================================
41 
42 #ifndef SHARK_DATA_CSV_H
43 #define SHARK_DATA_CSV_H
44 
45 #include <shark/Core/DLLSupport.h>
46 #include <shark/Data/Dataset.h>
47 
48 #include <fstream>
49 #include <string>
50 
51 namespace shark {
52 
53 /**
54  * \ingroup shark_globals
55  *
56  * @{
57  */
58 
59 
60 /// \brief Position of the label in a CSV file
61 ///
62 /// \par
63 /// This type describes the position of the label in a record of a CSV file.
64 /// The label can be positioned either in the first or the last column, or
65 /// there can be no label present at all.
69 };
70 
71 namespace detail {
72 
/// \brief Writes unlabeled records to a stream as character-separated values.
///
/// Every element of \p data becomes one line of output. The entries of a
/// record are separated by \p separator and printed with a precision of 10.
/// The format flags and the precision of \p out are restored before the
/// function returns normally, and the stream is flushed once at the end.
///
/// \param data Container that holds the samples
/// \param out The stream to be written to
/// \param separator The separator between elements
/// \param scientific Use scientific notation for floating point values?
/// \param fieldwidth Column-align using this field width (passed to std::setw)
template<typename T, typename Stream>
void exportCSV(const T &data,
	Stream &out,
	char separator,
	bool scientific = true,
	unsigned int fieldwidth = 0
) {
	SHARK_RUNTIME_CHECK(out, "Stream cannot be opened for writing.");

	// remember the caller's formatting so that it can be restored afterwards
	const std::ios_base::fmtflags oldFlags = out.flags();
	const std::streamsize oldPrecision = out.precision();

	// The floatfield mask clears a previously set "fixed" flag; having both
	// "fixed" and "scientific" set would select hexadecimal float output.
	if (scientific)
		out.setf(std::ios_base::scientific, std::ios_base::floatfield);
	out.precision(10);

	for (typename T::const_iterator it = data.begin(); it != data.end(); ++it) {
		SHARK_RUNTIME_CHECK(it->begin() != it->end(), "Record must not be empty");
		const std::size_t dim = (*it).size();
		for (std::size_t i = 0; i != dim; ++i) {
			if (i != 0)
				out << separator;
			// std::setw only affects the next formatted output, so it is set per entry
			out << std::setw(fieldwidth) << (*it)(i);
		}
		// plain newline: flushing after every record is needlessly slow
		out << '\n';
	}
	out.flush();

	// restore output format
	out.flags(oldFlags);
	out.precision(oldPrecision);
}
102 
103  // export function for labeled data
104 
105 template<typename T, typename U, typename Stream>
106 void exportCSV_labeled(const T &input, // Container that holds the samples
107  const U &labels, // Container that holds the labels
108  Stream &out, // The file to be read from
109  LabelPosition lp, // The position of the label
110  char separator, // The separator between elements
111  bool scientific = true, //scientific notation?
112  unsigned int fieldwidth = 0, //column-align using this field width
113  typename boost::enable_if<
114  std::is_arithmetic<typename boost::range_value<U>::type>
115  >::type* dummy = 0//enable this only for arithmetic types
116 ) {
117  SHARK_RUNTIME_CHECK(out, "Stream cannot be opened for writing.");
118 
119  if (scientific)
120  out.setf(std::ios_base::scientific);
121  std::streamsize ss = out.precision();
122  out.precision(10);
123 
124  typename T::const_iterator iti = input.begin();
125  typename U::const_iterator itl = labels.begin();
126 
127 
128  for (; iti != input.end(); ++iti, ++itl) {
129  SHARK_RUNTIME_CHECK(iti->begin() != iti->end(), "Record must not be empty");
130  if (lp == FIRST_COLUMN)
131  out << *itl << separator;
132  for (std::size_t i=0; i<(*iti).size()-1; i++) {
133  out << std::setw(fieldwidth) << (*iti)(i) << separator;
134  }
135  if (lp == FIRST_COLUMN) {
136  out << std::setw(fieldwidth) << (*iti)((*iti).size()-1) << std::endl;
137  } else {
138  out << std::setw(fieldwidth) << (*iti)((*iti).size()-1) << separator << *itl << std::endl;
139  }
140  }
141  out.precision(ss);
142 }
143 
144 // export function for data with vector labels
145 template<typename T, typename U, typename Stream>
146 void exportCSV_labeled(
147  const T &input, // Container that holds the samples
148  const U &labels, // Container that holds the labels
149  Stream &out, // The file to be read from
150  LabelPosition lp, // The position of the label
151  char separator, // The separator between elements
152  bool scientific = true, //scientific notation?
153  unsigned int fieldwidth = 0, //column-align using this field width
154  typename boost::disable_if<
155  std::is_arithmetic<typename boost::range_value<U>::type>
156  >::type* dummy = 0//enable this only for complex types
157 ) {
158  SHARK_RUNTIME_CHECK(out, "Stream cannot be opened for writing.");
159 
160 
161  if (scientific)
162  out.setf(std::ios_base::scientific);
163  std::streamsize ss = out.precision();
164  out.precision(10);
165 
166  typename T::const_iterator iti = input.begin();
167  typename U::const_iterator itl = labels.begin();
168 
169  for (; iti != input.end(); ++iti, ++itl) {
170  SHARK_RUNTIME_CHECK(iti->begin() != iti->end(), "[exportCSV (2)] record must not be empty");
171  if (lp == FIRST_COLUMN) {
172  for (std::size_t j = 0; j < itl->size(); j++)
173  out << std::setw(fieldwidth) << (*itl)(j) << separator;
174  }
175  for (std::size_t i=0; i<(*iti).size()-1; i++) {
176  out << std::setw(fieldwidth) << (*iti)(i) << separator;
177  }
178  if (lp == FIRST_COLUMN) {
179  out << std::setw(fieldwidth) << (*iti)((*iti).size()-1) << std::endl;
180  } else {
181  out << std::setw(fieldwidth) << (*iti)((*iti).size()-1);
182  for (std::size_t j = 0; j < itl->size(); j++)
183  out << std::setw(fieldwidth) << separator << (*itl)(j);
184  out << std::endl;
185  }
186  }
187  out.precision(ss);
188 }
189 } // namespace detail
190 
191 
192 
193 // ACTUAL READ IN ROUTINES BELOW
194 
195 /// \brief Import unlabeled vectors from a read-in character-separated value file.
196 ///
197 /// \param data Container storing the loaded data
198 /// \param contents The read in csv-file
199 /// \param separator Optional separator between entries, typically a comma; spaces are automatically ignored
200 /// \param comment Trailing character indicating comment line. By default it is '#'
201 /// \param maximumBatchSize Size of batches in the dataset
203  Data<FloatVector> &data,
204  std::string const& contents,
205  char separator = ',',
206  char comment = '#',
207  std::size_t maximumBatchSize = Data<RealVector>::DefaultBatchSize
208 );
209 
210 /// \brief Import unlabeled vectors from a read-in character-separated value file.
211 ///
212 /// \param data Container storing the loaded data
213 /// \param contents The read in csv-file
214 /// \param separator Optional separator between entries, typically a comma; spaces are automatically ignored
215 /// \param comment Trailing character indicating comment line. By default it is '#'
216 /// \param maximumBatchSize Size of batches in the dataset
218  Data<RealVector> &data,
219  std::string const& contents,
220  char separator = ',',
221  char comment = '#',
222  std::size_t maximumBatchSize = Data<RealVector>::DefaultBatchSize
223 );
224 
225 /// \brief Import "csv" from string consisting only of a single unsigned int per row
226 ///
227 /// \param data Container storing the loaded data
228 /// \param contents The read in csv-file
229 /// \param separator Optional separator between entries, typically a comma; spaces are automatically ignored
230 /// \param comment Trailing characters indicating comment line. By default it is "#"
231 /// \param maximumBatchSize Size of batches in the dataset
233  Data<unsigned int> &data,
234  std::string const& contents,
235  char separator = ',',
236  char comment = '#',
237  std::size_t maximumBatchSize = Data<unsigned int>::DefaultBatchSize
238 );
239 
240 /// \brief Import "csv" from string consisting only of a single int per row
241 ///
242 /// \param data Container storing the loaded data
243 /// \param contents The read in csv-file
244 /// \param separator Optional separator between entries, typically a comma; spaces are automatically ignored
245 /// \param comment Trailing characters indicating comment line. By default it is "#"
246 /// \param maximumBatchSize Size of batches in the dataset
248  Data<int> &data,
249  std::string const& contents,
250  char separator = ',',
251  char comment = '#',
252  std::size_t maximumBatchSize = Data<int>::DefaultBatchSize
253 );
254 
255 /// \brief Import "csv" from string consisting only of a single float per row
256 ///
257 /// \param data Container storing the loaded data
258 /// \param contents The read in csv-file
259 /// \param separator Optional separator between entries, typically a comma; spaces are automatically ignored
260 /// \param comment Trailing characters indicating comment line. By default it is "#"
261 /// \param maximumBatchSize Size of batches in the dataset
263  Data<float> &data,
264  std::string const& contents,
265  char separator = ',',
266  char comment = '#',
267  std::size_t maximumBatchSize = Data<double>::DefaultBatchSize
268 );
269 
270 /// \brief Import "csv" from string consisting only of a single double per row
271 ///
272 /// \param data Container storing the loaded data
273 /// \param contents The read in csv-file
274 /// \param separator Optional separator between entries, typically a comma; spaces are automatically ignored
275 /// \param comment Trailing characters indicating comment line. By default it is "#"
276 /// \param maximumBatchSize Size of batches in the dataset
278  Data<double> &data,
279  std::string const& contents,
280  char separator = ',',
281  char comment = '#',
282  std::size_t maximumBatchSize = Data<double>::DefaultBatchSize
283 );
284 
285 /// \brief Import labeled data from a character-separated value file.
286 ///
287 /// \param dataset Container storing the loaded data
288 /// \param contents the read-in file contents.
289 /// \param lp Position of the label in the record, either first or last column
290 /// \param separator Optional separator between entries, typically a comma; spaces are automatically ignored
291 /// \param comment Character for indicating a comment, by default '#'
292 /// \param maximumBatchSize maximum size of a batch in the dataset after import
295  std::string const& contents,
296  LabelPosition lp,
297  char separator = ',',
298  char comment = '#',
299  std::size_t maximumBatchSize = LabeledData<RealVector, unsigned int>::DefaultBatchSize
300 );
301 
302 /// \brief Import labeled data from a character-separated value file.
303 ///
304 /// \param dataset Container storing the loaded data
305 /// \param contents the read-in file contents.
306 /// \param lp Position of the label in the record, either first or last column
307 /// \param separator Optional separator between entries, typically a comma; spaces are automatically ignored
308 /// \param comment Character for indicating a comment, by default '#'
309 /// \param maximumBatchSize maximum size of a batch in the dataset after import
312  std::string const& contents,
313  LabelPosition lp,
314  char separator = ',',
315  char comment = '#',
316  std::size_t maximumBatchSize = LabeledData<RealVector, unsigned int>::DefaultBatchSize
317 );
318 
319 
320 /// \brief Import regression data from a read-in character-separated value file.
321 ///
322 /// \param dataset Container storing the loaded data
323 /// \param contents The read in csv-file
324 /// \param lp Position of the label in the record, either first or last column
325 /// \param separator Separator between entries, typically a comma or a space
326 /// \param comment Character for indicating a comment, by default '#'
327 /// \param numberOfOutputs Dimensionality of label/output
328 /// \param maximumBatchSize maximum size of a batch in the dataset after import
331  std::string const& contents,
332  LabelPosition lp,
333  std::size_t numberOfOutputs = 1,
334  char separator = ',',
335  char comment = '#',
336  std::size_t maximumBatchSize = LabeledData<RealVector, RealVector>::DefaultBatchSize
337 );
338 
339 /// \brief Import regression data from a read-in character-separated value file.
340 ///
341 /// \param dataset Container storing the loaded data
342 /// \param contents The read in csv-file
343 /// \param lp Position of the label in the record, either first or last column
344 /// \param separator Separator between entries, typically a comma or a space
345 /// \param comment Character for indicating a comment, by default '#'
346 /// \param numberOfOutputs Dimensionality of label/output
347 /// \param maximumBatchSize maximum size of a batch in the dataset after import
350  std::string const& contents,
351  LabelPosition lp,
352  std::size_t numberOfOutputs = 1,
353  char separator = ',',
354  char comment = '#',
355  std::size_t maximumBatchSize = LabeledData<RealVector, RealVector>::DefaultBatchSize
356 );
357 
358 
359 
360 /// \brief Import a Dataset from a csv file
361 ///
/// Opens \p fn, skips the first \p titleLines lines, reads the remainder of
/// the file into a string and passes that string to csvStringToData.
/// A file that cannot be opened is reported through SHARK_RUNTIME_CHECK.
///
362 /// \param data Container storing the loaded data
363 /// \param fn The file to be read from
364 /// \param separator Optional separator between entries, typically a comma; spaces are automatically ignored
365 /// \param comment Trailing character indicating comment line. By default it is '#'
366 /// \param maximumBatchSize Size of batches in the dataset
367 /// \param titleLines Specifies a number of lines to be skipped in the beginning of the file
368 template<class T>
// NOTE(review): the line holding the return type and the function name
// (listing line 369) is missing from this listing - restore it from the original header.
370  Data<T>& data,
371  std::string fn,
372  char separator = ',',
373  char comment = '#',
374  std::size_t maximumBatchSize = Data<T>::DefaultBatchSize,
375  std::size_t titleLines = 0
376 ){
377  std::ifstream stream(fn.c_str());
378  SHARK_RUNTIME_CHECK(stream, "Stream cannot be opened for reading.");
379 
// std::istream_iterator<char> extracts with operator>>, which would drop
// blanks and newlines unless whitespace skipping is switched off
380  stream.unsetf(std::ios::skipws);
381 
382  for(std::size_t i=0; i < titleLines; ++i) // ignoring the first lines
383  stream.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
384 
385  std::istream_iterator<char> streamBegin(stream);
386  std::string contents(//read contents of file in string
387  streamBegin,
388  std::istream_iterator<char>()
389  );
390  //call the actual parser
391  csvStringToData(data,contents,separator,comment,maximumBatchSize);
392 }
393 
394 /// \brief Import a labeled Dataset from a csv file
395 ///
/// Opens \p fn, reads the whole file into a string and passes that string
/// to csvStringToData. The labels are of type unsigned int.
/// A file that cannot be opened is reported through SHARK_RUNTIME_CHECK.
///
396 /// \param data Container storing the loaded data
397 /// \param fn The file to be read from
398 /// \param lp Position of the label in the record, either first or last column
399 /// \param separator Optional separator between entries, typically a comma; spaces are automatically ignored
400 /// \param comment Trailing character indicating comment line. By default it is '#'
401 /// \param maximumBatchSize Size of batches in the dataset
402 template<class T>
// NOTE(review): the line holding the return type and the function name
// (listing line 403) is missing from this listing - restore it from the original header.
404  LabeledData<blas::vector<T>, unsigned int>& data,
405  std::string fn,
406  LabelPosition lp,
407  char separator = ',',
408  char comment = '#',
// NOTE(review): the default is taken from LabeledData<RealVector, unsigned int>, independent of T
409  std::size_t maximumBatchSize = LabeledData<RealVector, unsigned int>::DefaultBatchSize
410 ){
411  std::ifstream stream(fn.c_str());
412  SHARK_RUNTIME_CHECK(stream, "Stream cannot be opened for reading.");
413 
// std::istream_iterator<char> extracts with operator>>, which would drop
// blanks and newlines unless whitespace skipping is switched off
414  stream.unsetf(std::ios::skipws);
415  std::istream_iterator<char> streamBegin(stream);
416  std::string contents(//read contents of file in string
417  streamBegin,
418  std::istream_iterator<char>()
419  );
420  //call the actual parser
421  csvStringToData(data,contents,lp,separator,comment,maximumBatchSize);
422 }
423 
424 /// \brief Import a labeled Dataset from a csv file
425 ///
/// Opens \p fn, reads the whole file into a string and passes that string
/// to csvStringToData. The labels are vectors with \p numberOfOutputs entries.
/// A file that cannot be opened is reported through SHARK_RUNTIME_CHECK.
///
426 /// \param data Container storing the loaded data
427 /// \param fn The file to be read from
428 /// \param lp Position of the label in the record, either first or last column
429 /// \param numberOfOutputs dimensionality of the labels
430 /// \param separator Optional separator between entries, typically a comma; spaces are automatically ignored
431 /// \param comment Trailing character indicating comment line. By default it is '#'
432 /// \param maximumBatchSize Size of batches in the dataset
433 template<class T>
// NOTE(review): the line holding the return type and the function name
// (listing line 434) is missing from this listing - restore it from the original header.
435  LabeledData<blas::vector<T>, blas::vector<T> >& data,
436  std::string fn,
437  LabelPosition lp,
438  std::size_t numberOfOutputs = 1,
439  char separator = ',',
440  char comment = '#',
// NOTE(review): the default is taken from LabeledData<RealVector, RealVector>, independent of T
441  std::size_t maximumBatchSize = LabeledData<RealVector, RealVector>::DefaultBatchSize
442 ){
443  std::ifstream stream(fn.c_str());
444  SHARK_RUNTIME_CHECK(stream, "Stream cannot be opened for reading.");
445 
// std::istream_iterator<char> extracts with operator>>, which would drop
// blanks and newlines unless whitespace skipping is switched off
446  stream.unsetf(std::ios::skipws);
447  std::istream_iterator<char> streamBegin(stream);
448  std::string contents(//read contents of file in string
449  streamBegin,
450  std::istream_iterator<char>()
451  );
452  //call the actual parser
453  csvStringToData(data,contents,lp, numberOfOutputs, separator,comment,maximumBatchSize);
454 }
455 
456 /// \brief Format unlabeled data into a character-separated value file.
457 ///
/// Opens \p fn for writing and forwards the elements of \p set to
/// detail::exportCSV. A file that cannot be opened is reported through
/// SHARK_RUNTIME_CHECK.
///
458 /// \param set Container to be exported
459 /// \param fn The file to be written to
460 /// \param separator Separator between entries, typically a comma or a space
461 /// \param sci should the output be in scientific notation?
462 /// \param width argument to std::setw when writing the output
463 template<typename Type>
// NOTE(review): the line holding the return type and the function name
// (listing line 464) is missing from this listing - restore it from the original header.
465  Data<Type> const& set,
466  std::string fn,
467  char separator = ',',
468  bool sci = true,
469  unsigned int width = 0
470 ) {
471  std::ofstream ofs(fn.c_str());
472  SHARK_RUNTIME_CHECK(ofs, "Stream cannot be opened for writing.");
// the element range of the data set is written one record per line
473  detail::exportCSV(set.elements(), ofs, separator, sci, width);
474 }
475 
476 
477 /// \brief Format labeled data into a character-separated value file.
478 ///
/// Opens \p fn for writing and forwards the input elements and the label
/// elements of \p dataset to detail::exportCSV_labeled. A file that cannot be
/// opened is reported through SHARK_RUNTIME_CHECK.
///
479 /// \param dataset Container to be exported
480 /// \param fn The file to be written to
481 /// \param lp Position of the label in the record, either first or last column
482 /// \param separator Separator between entries, typically a comma or a space
483 /// \param sci should the output be in scientific notation?
484 /// \param width argument to std::setw when writing the output
485 template<typename InputType, typename LabelType>
// NOTE(review): the line holding the return type and the function name
// (listing line 486) is missing from this listing - restore it from the original header.
487  LabeledData<InputType, LabelType> const &dataset,
488  std::string fn,
489  LabelPosition lp,
490  char separator = ',',
491  bool sci = true,
492  unsigned int width = 0
493 ) {
494  std::ofstream ofs(fn.c_str());
495  SHARK_RUNTIME_CHECK(ofs, "Stream cannot be opened for writing.");
// the overload of exportCSV_labeled is selected by whether the label type is arithmetic
496  detail::exportCSV_labeled(dataset.inputs().elements(), dataset.labels().elements(), ofs, lp, separator, sci, width);
497 }
498 
499 
500 /** @}*/
501 
502 } // namespace shark
503 #endif // SHARK_DATA_CSV_H