// Include guard and Boost / standard-library headers used by the HDF5 import
// helpers, followed by the first readHDF5Dataset overload (int buffer).
42 #ifndef SHARK_DATA_HDF5_H 43 #define SHARK_DATA_HDF5_H 51 #include <boost/array.hpp> 52 #include <boost/format.hpp> 53 #include <boost/range/algorithm/fill.hpp> 54 #include <boost/range/algorithm/max_element.hpp> 55 #include <boost/smart_ptr/scoped_array.hpp> 56 #include <type_traits> 70 herr_t readHDF5Dataset( hid_t loc_id,
const char *dset_name,
int *buffer )
// int overload: forwards all three arguments to H5LTread_dataset_int and
// returns its herr_t status unchanged. The caller in loadIntoMatrix treats a
// status >= 0 as success.
72 return H5LTread_dataset_int( loc_id, dset_name, buffer );
/// readHDF5Dataset overload for a long buffer.
///
/// @param loc_id     HDF5 location identifier, passed through unchanged
/// @param dset_name  dataset name, passed through unchanged
/// @param buffer     destination buffer, passed through unchanged
/// @return the herr_t status returned by H5LTread_dataset_long
75 herr_t readHDF5Dataset( hid_t loc_id,
const char *dset_name,
long *buffer )
77 return H5LTread_dataset_long( loc_id, dset_name, buffer );
/// readHDF5Dataset overload for a float buffer.
///
/// @param loc_id     HDF5 location identifier, passed through unchanged
/// @param dset_name  dataset name, passed through unchanged
/// @param buffer     destination buffer, passed through unchanged
/// @return the herr_t status returned by H5LTread_dataset_float
80 herr_t readHDF5Dataset( hid_t loc_id,
const char *dset_name,
float *buffer )
82 return H5LTread_dataset_float( loc_id, dset_name, buffer );
/// readHDF5Dataset overload for a double buffer.
///
/// @param loc_id     HDF5 location identifier, passed through unchanged
/// @param dset_name  dataset name, passed through unchanged
/// @param buffer     destination buffer, passed through unchanged
/// @return the herr_t status returned by H5LTread_dataset_double
85 herr_t readHDF5Dataset( hid_t loc_id,
const char *dset_name,
double *buffer )
87 return H5LTread_dataset_double( loc_id, dset_name, buffer );
/// Tests whether the in-memory element type RawValueType is compatible with an
/// HDF5 dataset whose elements have class typeClass and size typeSize.
///
/// Four pairings are checked. Each requires typeSize and sizeof(RawValueType)
/// to be the same fixed value: 8-byte float, 4-byte float, 4-byte integer and
/// 8-byte integer.
///
/// @tparam RawValueType  element type the caller wants to read into
/// @param  typeClass     HDF5 type class reported for the dataset
/// @param  typeSize      element size reported for the dataset (compared
///                       against sizeof, so measured in bytes)
///
/// NOTE(review): the statements inside each branch and after the chain are not
/// part of this listing. Presumably a matching branch returns true and the
/// fall-through returns false -- confirm against the full file.
92 template<
typename RawValueType>
93 bool isSupported(H5T_class_t typeClass,
size_t typeSize)
// HDF5 float of size 8 paired with an 8-byte floating-point C++ type.
95 if (H5T_FLOAT == typeClass && 8 == typeSize && std::is_floating_point < RawValueType > ::value
96 &&
sizeof(RawValueType) == 8) {
99 }
// HDF5 float of size 4 paired with a 4-byte floating-point C++ type.
else if (H5T_FLOAT == typeClass && 4 == typeSize && std::is_floating_point < RawValueType > ::value
100 &&
sizeof(RawValueType) == 4) {
103 }
// HDF5 integer of size 4 paired with a 4-byte integral C++ type.
else if (H5T_INTEGER == typeClass && 4 == typeSize && std::is_integral < RawValueType > ::value
104 &&
sizeof(RawValueType) == 4) {
107 }
// HDF5 integer of size 8 paired with an 8-byte integral C++ type.
else if (H5T_INTEGER == typeClass && 8 == typeSize && std::is_integral < RawValueType > ::value
108 &&
sizeof(RawValueType) == 8) {
/// Reads dataset dataSetName from the HDF5 file fileName and appends its
/// contents to data, one VectorType per index along the second dimension.
///
/// @tparam MatrixType   container of vectors; must expose value_type (the
///                      vector type, which in turn exposes value_type) and
///                      push_back
/// @param  data         output container; loaded vectors are appended to it
/// @param  fileName     path of the HDF5 file, opened with H5F_ACC_RDONLY
/// @param  dataSetName  name of the dataset to read
///
/// Failures are reported through SHARKEXCEPTION and SHARK_RUNTIME_CHECK, both
/// defined outside this listing.
///
/// NOTE(review): several original lines are absent from this listing: the
/// condition guarding the throw, the ScopedHandle constructor arguments, the
/// initialisation of dims, the declaration of typeSize and the opening lines
/// of some checks. Comments below describe only what is visible.
127 template<
typename MatrixType>
128 void loadIntoMatrix(
MatrixType& data,
const std::string& fileName,
const std::string& dataSetName)
// VectorType is one row of the output; RawValueType is its scalar element.
130 typedef typename MatrixType::value_type
VectorType;
131 typedef typename VectorType::value_type RawValueType;
// Capacity of the fixed-size array that receives the dataset extents.
137 const size_t MAX_DIMENSIONS = 64u;
// Open the file read-only with default access properties.
140 hid_t open = H5Fopen(fileName.c_str(), H5F_ACC_RDONLY, H5P_DEFAULT);
// Raised when the open failed; the guarding condition is not visible here.
142 throw SHARKEXCEPTION((boost::format(
"[loadIntoMatrix] open file name: %1% (FAILED)") % fileName).str());
// Hold the file id in a ScopedHandle; its constructor arguments are not
// visible -- presumably the id plus a close function, TODO confirm.
144 const ScopedHandle<hid_t> fileId(
149 boost::array<hsize_t, MAX_DIMENSIONS> dims;
151 H5T_class_t typeClass;
// Query extents, type class and element size; a negative status fails the check.
154 H5LTget_dataset_info(*fileId, dataSetName.c_str(), dims.c_array(), &typeClass, &typeSize) >= 0,
155 (boost::format(
"Get data set(%1%) info from file(%2%).") % dataSetName % fileName).str());
// Message of the check rejecting datasets with three or more dimensions
// (the condition itself is not visible in this listing).
164 "[%1%][%2%] Support 1 or 2 dimensions, but this dataset has at least 3 dimensions.") % fileName % dataSetName).str());
166 const hsize_t dim0 = dims[0];
// A zero second extent is replaced by 1, so the data is handled as dim0 x 1.
// NOTE(review): relies on unused dims entries being zero; the initialisation
// of dims is not visible here -- confirm.
167 const hsize_t dim1 = (0 == dims[1]) ? 1 : dims[1];
// Reject datasets whose stored type does not match RawValueType.
170 detail::isSupported<RawValueType>(typeClass, typeSize),
172 "DataType doesn't match. HDF5 data type in dataset(%3%::%4%): %1%, size: %2%")
176 % dataSetName).str());
// Temporary contiguous buffer holding all dim0 * dim1 elements.
// NOTE(review): the product is not checked for overflow before allocating.
179 const boost::scoped_array<RawValueType> dataBuffer(
new RawValueType[dim0 * dim1]);
180 SHARK_RUNTIME_CHECK(detail::readHDF5Dataset(*fileId, dataSetName.c_str(), dataBuffer.get()) >= 0,
" Read data set.");
// Build dim1 vectors of dim0 elements each. Vector i takes the elements at
// offsets i + j * dim1, i.e. column i when the buffer is viewed as dim0 rows
// of dim1 entries.
183 for (
size_t i = 0; i < dim1; ++i) {
184 VectorType sample(dim0);
185 for (
size_t j = 0; j < dim0; ++j)
186 sample[j] = dataBuffer[i + j * dim1];
187 data.push_back(sample);
/// Loads a matrix stored as three HDF5 datasets (values, indices, index
/// pointers) and scatters the values into data, one entry of data per column.
///
/// @tparam MatrixType      container of vectors; must expose value_type,
///                         resize and operator[]
/// @param  data            output; resized to the column count, then written
///                         at data[column][index]
/// @param  fileName        path of the HDF5 file
/// @param  cscDatasetName  exactly three dataset names, in the order
///                         values, indices, index pointers
///
/// NOTE(review): lines absent from this listing include the opener of the
/// first check, the declaration of valIdx and whatever sizes each data[i].
/// rowCount is computed but not used in any visible line, so it presumably
/// feeds that sizing step -- confirm against the full file.
196 template<
typename MatrixType>
197 void loadHDF5Csc(
MatrixType& data,
const std::string& fileName,
const std::vector<std::string>& cscDatasetName)
199 typedef typename MatrixType::value_type
VectorType;
202 3 == cscDatasetName.size(),
203 "Must provide 3 dataset names for importing Compressed Sparse Column format.");
// Each dataset is loaded through loadIntoMatrix into its own buffer.
205 std::vector<VectorType> valBuf;
206 std::vector<std::vector<boost::int32_t> > indicesBuf;
207 std::vector<std::vector<boost::int32_t> > indexPtrBuf;
208 detail::loadIntoMatrix(valBuf, fileName, cscDatasetName[0]);
209 detail::loadIntoMatrix(indicesBuf, fileName, cscDatasetName[1]);
210 detail::loadIntoMatrix(indexPtrBuf, fileName, cscDatasetName[2]);
// Each buffer must hold exactly one vector, i.e. the dataset was one-dimensional.
211 SHARK_RUNTIME_CHECK(1u == valBuf.size() && 1u == indicesBuf.size() && 1u == indexPtrBuf.size(),
"All datasets should be of one dimension.");
213 const VectorType& val = valBuf.front();
214 const std::vector<boost::int32_t>& indices = indicesBuf.front();
215 const std::vector<boost::int32_t>& indexPtr = indexPtrBuf.front();
// Consistency checks: one index per value, and the final index pointer must
// equal the number of values.
216 SHARK_RUNTIME_CHECK(val.size() == indices.size(),
"Size of value and indices should be the same.");
// NOTE(review): back() is called with no visible guard against an empty indexPtr.
217 SHARK_RUNTIME_CHECK(indexPtr.back() == (boost::int32_t)val.size(),
"Last element of index pointer should equal to size of value.");
// One column per consecutive pair of index pointers.
220 const boost::uint32_t columnCount = indexPtr.size() - 1;
// Largest index plus one.
// NOTE(review): max_element is dereferenced with no visible guard against an
// empty indices vector.
221 const boost::uint32_t rowCount = *boost::max_element(indices) + 1;
223 data.resize(columnCount);
// For column i, entries indexPtr[i] .. indexPtr[i + 1] - 1 of indices give the
// positions written in data[i]; values are consumed from val in order.
// NOTE(review): indices and indexPtr entries are used as subscripts without a
// visible range check.
227 for (
size_t i = 0; i < columnCount; ++i) {
228 for (boost::int32_t j = indexPtr[i]; j < indexPtr[i + 1]; ++j) {
229 data[i][indices[j]] = val[valIdx++];
/// Validates previously loaded inputs and labels before they are combined into
/// labeledData.
///
/// @param labeledData  output dataset
/// @param dataBuffer   loaded input vectors
/// @param labelBuffer  loaded labels; must contain exactly one vector, whose
///                     length equals dataBuffer.size()
///
/// NOTE(review): the openers of both checks and the statements that actually
/// fill labeledData are not part of this listing; only the validation
/// conditions and their messages are visible.
242 template<
typename VectorType,
typename LabelType>
243 void constructLabeledData(
244 LabeledData<VectorType, LabelType>& labeledData,
245 const std::vector<VectorType>& dataBuffer,
246 const std::vector<std::vector<LabelType> >& labelBuffer)
// Exactly one label vector is expected.
249 1 == labelBuffer.size(),
250 (boost::format(
"Expect only one label vector, but get %1%.") % labelBuffer.size()).str());
// That vector must have one label per input vector.
252 dataBuffer.size() == labelBuffer.front().size(),
253 boost::format(
"Dimensions of data and label don't match.").str());
/// Dense import entry point: loads dataset datasetName from fileName into a
/// temporary std::vector<VectorType> via detail::loadIntoMatrix.
///
/// @param fileName     path of the HDF5 file
/// @param datasetName  name of the dataset to load
///
/// NOTE(review): the function name, its first parameter and the statement that
/// hands readinBuffer to the caller are not visible in this listing.
267 template<
typename VectorType>
270 const std::string& fileName,
271 const std::string& datasetName)
273 std::vector<VectorType> readinBuffer;
274 detail::loadIntoMatrix(readinBuffer, fileName, datasetName);
/// Dense labeled import entry point: loads the input dataset and the label
/// dataset from the same file, then combines them with
/// detail::constructLabeledData.
///
/// @param fileName  path of the HDF5 file
/// @param data      name of the dataset holding the inputs
/// @param label     name of the dataset holding the labels
///
/// NOTE(review): the function name and the declaration of labeledData are not
/// visible in this listing; labeledData is presumably the first parameter.
293 template<
typename VectorType,
typename LabelType>
296 const std::string& fileName,
297 const std::string& data,
298 const std::string& label)
300 std::vector<VectorType> readinData;
301 std::vector < std::vector<LabelType> > readinLabel;
// Inputs and labels are both read with the dense loader.
303 detail::loadIntoMatrix(readinData, fileName, data);
304 detail::loadIntoMatrix(readinLabel, fileName, label);
305 detail::constructLabeledData(labeledData, readinData, readinLabel);
/// Sparse import entry point: loads a matrix described by three datasets into
/// a temporary std::vector<VectorType> via detail::loadHDF5Csc.
///
/// @param fileName        path of the HDF5 file
/// @param cscDatasetName  dataset names forwarded to detail::loadHDF5Csc,
///                        which requires exactly three
///
/// NOTE(review): the function name, its first parameter and the statement that
/// hands readinBuffer to the caller are not visible in this listing.
316 template<
typename VectorType>
319 const std::string& fileName,
320 const std::vector<std::string>& cscDatasetName)
322 std::vector<VectorType> readinBuffer;
323 detail::loadHDF5Csc(readinBuffer, fileName, cscDatasetName);
/// Sparse labeled import entry point: loads the inputs through
/// detail::loadHDF5Csc and the labels through detail::loadIntoMatrix, then
/// combines them with detail::constructLabeledData.
///
/// @param fileName        path of the HDF5 file
/// @param cscDatasetName  dataset names forwarded to detail::loadHDF5Csc,
///                        which requires exactly three
/// @param label           name of the dataset holding the labels
///
/// NOTE(review): the function name and the declaration of labeledData are not
/// visible in this listing; labeledData is presumably the first parameter.
342 template<
typename VectorType,
typename LabelType>
345 const std::string& fileName,
346 const std::vector<std::string>& cscDatasetName,
347 const std::string& label)
349 std::vector<VectorType> readinData;
350 std::vector < std::vector<LabelType> > readinLabel;
// Inputs use the sparse loader; labels use the dense loader.
352 detail::loadHDF5Csc(readinData, fileName, cscDatasetName);
353 detail::loadIntoMatrix(readinLabel, fileName, label);
354 detail::constructLabeledData(labeledData, readinData, readinLabel);
359 #endif // SHARK_DATA_HDF5_H