HDF5.h
Go to the documentation of this file.
1 //===========================================================================
2 /*!
3  *
4  *
5  * \brief Support for importing data from HDF5 file
6  *
7  *
8  * \par
9  * The most important application of the methods provided in this
10  * file is the import of data from HDF5 files into Shark data
11  * containers.
12  *
13  *
14  *
15  *
16  * \author B. Li
17  * \date 2012
18  *
19  *
20  * \par Copyright 1995-2017 Shark Development Team
21  *
22  * <BR><HR>
23  * This file is part of Shark.
24  * <http://shark-ml.org/>
25  *
26  * Shark is free software: you can redistribute it and/or modify
27  * it under the terms of the GNU Lesser General Public License as published
28  * by the Free Software Foundation, either version 3 of the License, or
29  * (at your option) any later version.
30  *
31  * Shark is distributed in the hope that it will be useful,
32  * but WITHOUT ANY WARRANTY; without even the implied warranty of
33  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
34  * GNU Lesser General Public License for more details.
35  *
36  * You should have received a copy of the GNU Lesser General Public License
37  * along with Shark. If not, see <http://www.gnu.org/licenses/>.
38  *
39  */
40 //===========================================================================
41 
42 #ifndef SHARK_DATA_HDF5_H
43 #define SHARK_DATA_HDF5_H
44 
46 #include "shark/Data/Dataset.h"
47 
48 #include <hdf5.h> // This must come before #include <hdf5_hl.h>
49 #include <hdf5_hl.h>
50 
51 #include <boost/array.hpp>
52 #include <boost/format.hpp>
53 #include <boost/range/algorithm/fill.hpp>
54 #include <boost/range/algorithm/max_element.hpp>
55 #include <boost/smart_ptr/scoped_array.hpp>
56 #include <type_traits>
57 
58 namespace shark {
59 
60 namespace detail {
61 
/// Overloaded functions so that the compiler is able to automatically detect which function to call
/// @note
/// Basically there are two ways to add support for other data types:
/// (a) Use the corresponding API in H5LTpublic.h if the type is supported (luckily)
/// (b) Use H5LTread_dataset(), which needs the type_id to be passed in; the type ids are listed at:
/// http://www.hdfgroup.org/HDF5/doc/RM/PredefDTypes.html
/// Pay special attention to endianness.
69 ///@{
70 herr_t readHDF5Dataset( hid_t loc_id, const char *dset_name, int *buffer )
71 {
72  return H5LTread_dataset_int( loc_id, dset_name, buffer );
73 }
74 
75 herr_t readHDF5Dataset( hid_t loc_id, const char *dset_name, long *buffer )
76 {
77  return H5LTread_dataset_long( loc_id, dset_name, buffer );
78 }
79 
80 herr_t readHDF5Dataset( hid_t loc_id, const char *dset_name, float *buffer )
81 {
82  return H5LTread_dataset_float( loc_id, dset_name, buffer );
83 }
84 
85 herr_t readHDF5Dataset( hid_t loc_id, const char *dset_name, double *buffer )
86 {
87  return H5LTread_dataset_double( loc_id, dset_name, buffer );
88 }
89 ///@}
90 
91 /// Check whether typeClass and typeSize are supported by current implementation
92 template<typename RawValueType>
93 bool isSupported(H5T_class_t typeClass, size_t typeSize)
94 {
95  if (H5T_FLOAT == typeClass && 8 == typeSize && std::is_floating_point < RawValueType > ::value
96  && sizeof(RawValueType) == 8) {
97  // double
98  return true;
99  } else if (H5T_FLOAT == typeClass && 4 == typeSize && std::is_floating_point < RawValueType > ::value
100  && sizeof(RawValueType) == 4) {
101  // float
102  return true;
103  } else if (H5T_INTEGER == typeClass && 4 == typeSize && std::is_integral < RawValueType > ::value
104  && sizeof(RawValueType) == 4) {
105  // int
106  return true;
107  } else if (H5T_INTEGER == typeClass && 8 == typeSize && std::is_integral < RawValueType > ::value
108  && sizeof(RawValueType) == 8) {
109  // long
110  return true;
111  }
112 
113  return false;
114 }
115 
116 /// @brief Load a dataset in a HDF5 file into a matrix
117 ///
118 /// @param data
119 /// in vector of vector format which should support assignment operations
120 /// @param fileName
121 /// The name of HDF5 file to be read from
122 /// @param dataSetName
123 /// the HDF5 dataset name to access in the HDF5 file
124 ///
125 /// @tparam MatrixType
126 /// The type of data container which will accept read-in data and should be a 2-dimension matrix
127 template<typename MatrixType>
128 void loadIntoMatrix(MatrixType& data, const std::string& fileName, const std::string& dataSetName)
129 {
130  typedef typename MatrixType::value_type VectorType; // e.g., std::vector<double>
131  typedef typename VectorType::value_type RawValueType; // e.g., double
132 
133  // Disable HDF5 diagnosis message which could be commented out in case of debugging HDF5 related issues
134  H5Eset_auto1(0, 0);
135 
136  // 64 is big enough for HDF5, which supports no more than 32 dimensions presently
137  const size_t MAX_DIMENSIONS = 64u;
138 
139  // Open the file, and then get dimension
140  hid_t open = H5Fopen(fileName.c_str(), H5F_ACC_RDONLY, H5P_DEFAULT);
141  if(open < 0)
142  throw SHARKEXCEPTION((boost::format("[loadIntoMatrix] open file name: %1% (FAILED)") % fileName).str());
143 
144  const ScopedHandle<hid_t> fileId(
145  open,
146  H5Fclose
147  );
148 
149  boost::array<hsize_t, MAX_DIMENSIONS> dims;
150  dims.assign(0);
151  H5T_class_t typeClass;
152  size_t typeSize;
154  H5LTget_dataset_info(*fileId, dataSetName.c_str(), dims.c_array(), &typeClass, &typeSize) >= 0,
155  (boost::format("Get data set(%1%) info from file(%2%).") % dataSetName % fileName).str());
156 
157  if (0 == dims[0])
158  return;
159 
160  // Support 1 or 2 dimensions only at the moment
162  0 == dims[2],
163  (boost::format(
164  "[%1%][%2%] Support 1 or 2 dimensions, but this dataset has at least 3 dimensions.") % fileName % dataSetName).str());
165 
166  const hsize_t dim0 = dims[0];
167  const hsize_t dim1 = (0 == dims[1]) ? 1 : dims[1]; // treat one dimension as two-dimension of N x 1
168 
170  detail::isSupported<RawValueType>(typeClass, typeSize),
171  (boost::format(
172  "DataType doesn't match. HDF5 data type in dataset(%3%::%4%): %1%, size: %2%")
173  % typeClass
174  % typeSize
175  % fileName
176  % dataSetName).str());
177 
178  // Read data into a buffer
179  const boost::scoped_array<RawValueType> dataBuffer(new RawValueType[dim0 * dim1]);
180  SHARK_RUNTIME_CHECK(detail::readHDF5Dataset(*fileId, dataSetName.c_str(), dataBuffer.get()) >= 0, " Read data set.");
181 
182  // dims[0] = M, dims[1] = N, means each basic vector has M elements, and there are N of them.
183  for (size_t i = 0; i < dim1; ++i) {
184  VectorType sample(dim0);
185  for (size_t j = 0; j < dim0; ++j)
186  sample[j] = dataBuffer[i + j * dim1]; // elements in memory are in row-major order
187  data.push_back(sample);
188  }
189 }
190 
191 /// @brief load a matrix from HDF5 file in compressed sparse column format
192 ///
193 /// @param data the container which will hold the output matrix
194 /// @param fileName the name of HDF5 file
195 /// @param cscDatasetName dataset names for describing the CSC
196 template<typename MatrixType>
197 void loadHDF5Csc(MatrixType& data, const std::string& fileName, const std::vector<std::string>& cscDatasetName)
198 {
199  typedef typename MatrixType::value_type VectorType; // e.g., std::vector<double>
200 
202  3 == cscDatasetName.size(),
203  "Must provide 3 dataset names for importing Compressed Sparse Column format.");
204 
205  std::vector<VectorType> valBuf;
206  std::vector<std::vector<boost::int32_t> > indicesBuf;
207  std::vector<std::vector<boost::int32_t> > indexPtrBuf;
208  detail::loadIntoMatrix(valBuf, fileName, cscDatasetName[0]);
209  detail::loadIntoMatrix(indicesBuf, fileName, cscDatasetName[1]);
210  detail::loadIntoMatrix(indexPtrBuf, fileName, cscDatasetName[2]);
211  SHARK_RUNTIME_CHECK(1u == valBuf.size() && 1u == indicesBuf.size() && 1u == indexPtrBuf.size(), "All datasets should be of one dimension.");
212 
213  const VectorType& val = valBuf.front();
214  const std::vector<boost::int32_t>& indices = indicesBuf.front(); // WARNING: Not all indices are of int32 type
215  const std::vector<boost::int32_t>& indexPtr = indexPtrBuf.front();
216  SHARK_RUNTIME_CHECK(val.size() == indices.size(), "Size of value and indices should be the same.");
217  SHARK_RUNTIME_CHECK(indexPtr.back() == (boost::int32_t)val.size(), "Last element of index pointer should equal to size of value.");
218 
219  // Figure out dimensions of dense matrix
220  const boost::uint32_t columnCount = indexPtr.size() - 1; // the last one is place holder
221  const boost::uint32_t rowCount = *boost::max_element(indices) + 1; // max index plus 1
222 
223  data.resize(columnCount);
224  boost::fill(data, VectorType(rowCount, 0)); // pre-fill zero
225 
226  size_t valIdx = 0;
227  for (size_t i = 0; i < columnCount; ++i) {
228  for (boost::int32_t j = indexPtr[i]; j < indexPtr[i + 1]; ++j) {
229  data[i][indices[j]] = val[valIdx++];
230  }
231  }
232 }
233 
234 /// @brief Construct labeled data from passed in data and label
235 ///
236 /// @param labeledData
237 /// Container storing the loaded data
238 /// @param dataBuffer
239 /// The data container will hold
240 /// @param labelBuffer
241 /// The label for data inside @a dataBuffer
242 template<typename VectorType, typename LabelType>
243 void constructLabeledData(
244  LabeledData<VectorType, LabelType>& labeledData,
245  const std::vector<VectorType>& dataBuffer,
246  const std::vector<std::vector<LabelType> >& labelBuffer)
247 {
249  1 == labelBuffer.size(),
250  (boost::format("Expect only one label vector, but get %1%.") % labelBuffer.size()).str());
252  dataBuffer.size() == labelBuffer.front().size(),
253  boost::format("Dimensions of data and label don't match.").str());
254 
255  labeledData = createLabeledDataFromRange(dataBuffer, labelBuffer.front());
256 }
257 
258 } // namespace details
259 
260 /// @brief Import data from a HDF5 file.
261 ///
262 /// @param data Container storing the loaded data
263 /// @param fileName The name of HDF5 file to be read from
264 /// @param datasetName the HDF5 dataset name to access in the HDF5 file
265 ///
266 /// @tparam VectorType Type of object stored in Shark data container
267 template<typename VectorType>
269  Data<VectorType>& data,
270  const std::string& fileName,
271  const std::string& datasetName)
272 {
273  std::vector<VectorType> readinBuffer;
274  detail::loadIntoMatrix(readinBuffer, fileName, datasetName);
275  data = createDataFromRange(readinBuffer);
276 }
277 
278 /// @brief Import data to a LabeledData object from a HDF5 file.
279 ///
280 /// @param labeledData
281 /// Container storing the loaded data
282 /// @param fileName
283 /// The name of HDF5 file to be read from
284 /// @param data
285 /// the HDF5 dataset name for data
286 /// @param label
287 /// the HDF5 dataset name for label
288 ///
289 /// @tparam VectorType
290 /// Type of object stored in Shark data container
291 /// @tparam LableType
292 /// Type of label
293 template<typename VectorType, typename LabelType>
296  const std::string& fileName,
297  const std::string& data,
298  const std::string& label)
299 {
300  std::vector<VectorType> readinData;
301  std::vector < std::vector<LabelType> > readinLabel;
302 
303  detail::loadIntoMatrix(readinData, fileName, data);
304  detail::loadIntoMatrix(readinLabel, fileName, label);
305  detail::constructLabeledData(labeledData, readinData, readinLabel);
306 }
307 
308 /// @brief Import data from HDF5 dataset of compressed sparse column format.
309 ///
310 /// @param data Container storing the loaded data
311 /// @param fileName The name of HDF5 file to be read from
312 /// @param cscDatasetName
313 /// the CSC dataset names used to construct a matrix
314 ///
315 /// @tparam VectorType Type of object stored in Shark data container
316 template<typename VectorType>
318  Data<VectorType>& data,
319  const std::string& fileName,
320  const std::vector<std::string>& cscDatasetName)
321 {
322  std::vector<VectorType> readinBuffer;
323  detail::loadHDF5Csc(readinBuffer, fileName, cscDatasetName);
324  data = createDataFromRange(readinBuffer);
325 }
326 
327 /// @brief Import data from HDF5 dataset of compressed sparse column format.
328 ///
329 /// @param labeledData
330 /// Container storing the loaded data
331 /// @param fileName
332 /// The name of HDF5 file to be read from
333 /// @param cscDatasetName
334 /// the CSC dataset names used to construct a matrix
335 /// @param label
336 /// the HDF5 dataset name for label
337 ///
338 /// @tparam VectorType
339 /// Type of object stored in Shark data container
340 /// @tparam LabelType
341 /// Type of label
342 template<typename VectorType, typename LabelType>
345  const std::string& fileName,
346  const std::vector<std::string>& cscDatasetName,
347  const std::string& label)
348 {
349  std::vector<VectorType> readinData;
350  std::vector < std::vector<LabelType> > readinLabel;
351 
352  detail::loadHDF5Csc(readinData, fileName, cscDatasetName);
353  detail::loadIntoMatrix(readinLabel, fileName, label);
354  detail::constructLabeledData(labeledData, readinData, readinLabel);
355 }
356 
357 } // namespace shark {
358 
359 #endif // SHARK_DATA_HDF5_H