Download.h
Go to the documentation of this file.
1 //===========================================================================
2 /*!
3  *
4  *
5  * \brief Support for downloading data sets from online sources.
6  *
7  *
8  * \par
9  * The functions in this file allow downloading data sets from the
10  * mldata.org repository and other sources.
11  *
12  *
13  *
14  *
15  * \author T. Glasmachers
16  * \date 2016
17  *
18  *
19  * \par Copyright 1995-2017 Shark Development Team
20  *
21  * <BR><HR>
22  * This file is part of Shark.
23  * <http://shark-ml.org/>
24  *
25  * Shark is free software: you can redistribute it and/or modify
26  * it under the terms of the GNU Lesser General Public License as published
27  * by the Free Software Foundation, either version 3 of the License, or
28  * (at your option) any later version.
29  *
30  * Shark is distributed in the hope that it will be useful,
31  * but WITHOUT ANY WARRANTY; without even the implied warranty of
32  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
33  * GNU Lesser General Public License for more details.
34  *
35  * You should have received a copy of the GNU Lesser General Public License
36  * along with Shark. If not, see <http://www.gnu.org/licenses/>.
37  *
38  */
39 //===========================================================================
40 
41 #ifndef SHARK_DATA_DOWNLOAD_H
42 #define SHARK_DATA_DOWNLOAD_H
43 
44 #include <shark/Core/DLLSupport.h>
45 #include <shark/Data/Dataset.h>
46 #include <shark/Data/SparseData.h>
47 #include <shark/Data/Csv.h>
48 #include <sstream>
49 
50 namespace shark {
51 
52 /**
53  * \ingroup shark_globals
54  *
55  * @{
56  */
57 
58 
59 /// \brief Split a URL into its domain and resource parts.
60 ///
61 /// \param url URL to split
62 /// \return std::pair whose first element is the domain and whose second
63 /// element is the resource. The pair can be unpacked with std::tie:
64 /// std::tie(domain, resource) = splitUrl(url); fills two std::string variables.
65 SHARK_EXPORT_SYMBOL std::pair<std::string, std::string> splitUrl(std::string const & url);
66 
67 /// \brief Download a document with the HTTP protocol.
68 ///
69 /// \param url download URL, for example "www.shark-ml.org/index.html"
70 /// \param port TCP/IP port, defaults to 80
71 /// \return body of the HTTP reply; on success this is the requested document
72 ///
73 /// The function requests the document with an HTTP request and returns
74 /// the body of the corresponding HTTP reply. In case of an error the
75 /// function throws an exception. Note that the function does not perform
76 /// standard actions of web browsers, e.g., execute JavaScript or follow
77 /// HTTP redirects. All HTTP response status codes other than 200 are
78 /// reported as failure to download the document and trigger an
79 /// exception.
80 SHARK_EXPORT_SYMBOL std::string download(std::string const& url, unsigned short port = 80);
81 
82 /// \brief Download and import a sparse data (libSVM) file.
83 ///
84 /// \param dataset container storing the loaded data
85 /// \param url http URL
86 /// \param port TCP/IP port, default is 80
87 /// \param highestIndex highest feature index, or 0 for auto-detection
88 /// \param batchSize size of batch
89 template <class InputType, class LabelType> void downloadSparseData(
91  std::string const& url,
92  unsigned short port = 80,
93  unsigned int highestIndex = 0,
95 )
96 {
97  std::string content = download(url, port);
98  std::stringstream ss(content);
99  importSparseData(dataset, ss, highestIndex, batchSize);
100 }
101 
102 
103 /// \brief Download a data set from mldata.org.
104 ///
105 /// \param dataset container storing the loaded data
106 /// \param name data set name
107 /// \param batchSize size of batch
108 template <class InputType, class LabelType> void downloadFromMLData(
110  std::string const& name,
112 )
113 {
114  std::string filename;
115  for (char c : name)
116  {
117  if (c == ' ') c = '-';
118  else if (c >= 'A' && c <= 'Z') c += 32;
119  else if (c == '[' || c == '(' || c == ')' || c == '.' || c == ']') continue;
120  filename += c;
121  }
122  downloadSparseData(dataset, "mldata.org/repository/data/download/libsvm/" + filename + "/", 80, 0, batchSize);
123 }
124 
125 
126 /// \brief Download and import a dense data (CSV) file for classification.
127 ///
128 /// \param dataset container storing the loaded data
129 /// \param url http URL
130 /// \param lp Position of the label in the record, either first or last column
131 /// \param separator Optional separator between entries, typically a comma, spaces ar automatically ignored
132 /// \param comment Trailing character indicating comment line. By dfault it is '#'
133 /// \param port TCP/IP port, default is 80
134 /// \param maximumBatchSize size of batches in the dataset
135 template <class InputType> void downloadCsvData(
137  std::string const& url,
138  LabelPosition lp,
139  char separator = ',',
140  char comment = '#',
141  unsigned short port = 80,
142  std::size_t maximumBatchSize = LabeledData<RealVector, RealVector>::DefaultBatchSize
143 )
144 {
145  std::string content = download(url, port);
146  csvStringToData(dataset, content, lp, separator, comment, maximumBatchSize);
147 }
148 
149 
150 /// \brief Download and import a dense data (CSV) file for regression.
151 ///
152 /// \param dataset container storing the loaded data
153 /// \param url http URL
154 /// \param lp Position of the label in the record, either first or last column
155 /// \param numberOfOutputs dimensionality of the labels
156 /// \param separator Optional separator between entries, typically a comma, spaces ar automatically ignored
157 /// \param comment Trailing character indicating comment line. By dfault it is '#'
158 /// \param port TCP/IP port, default is 80
159 /// \param maximumBatchSize size of batches in the dataset
160 template <class InputType> void downloadCsvData(
162  std::string const& url,
163  LabelPosition lp,
164  std::size_t numberOfOutputs = 1,
165  char separator = ',',
166  char comment = '#',
167  unsigned short port = 80,
168  std::size_t maximumBatchSize = LabeledData<RealVector, RealVector>::DefaultBatchSize
169 )
170 {
171  std::string content = download(url, port);
172  csvStringToData(dataset, content, lp, numberOfOutputs, separator, comment, maximumBatchSize);
173 }
174 
175 
176 /** @}*/
177 
178 }
179 #endif