// Include guard for this header, followed by the start of a class template
// parameterised on the dataset type.
// NOTE(review): the line that names the class is not visible in this extract.
35 #ifndef SHARK_DATA_CVDATASETTOOLS_H 36 #define SHARK_DATA_CVDATASETTOOLS_H 48 template<
class DatasetTypeT>
// IndexSet is the index-container type exposed by the dataset type; every fold
// below is stored as one IndexSet.
52 typedef typename DatasetType::IndexSet
IndexSet;
// Constructor fragment: initialises m_dataset from `set` and m_validationFolds
// from the caller-supplied list of validation index sets.
60 DatasetType
const &
set,
61 std::vector<IndexSet>
const &validationIndizes
62 ) : m_dataset(set),m_validationFolds(validationIndizes) {}
// Constructor fragment: builds the validation index sets from `foldStart`, where
// foldStart[p] is used as the first batch index of partition p.
// NOTE(review): the initialiser list of this constructor is not visible here.
65 DatasetType
const &
set,
66 std::vector<std::size_t>
const &foldStart
68 for (std::size_t partition = 0; partition != foldStart.size(); partition++) {
// A partition ends where the next one starts; the last one ends at
// set.numberOfBatches().
69 std::size_t partitionSize = (partition+1 == foldStart.size()) ?
set.numberOfBatches() : foldStart[partition+1];
// NOTE(review): unsigned subtraction - wraps around if foldStart is not
// non-decreasing or exceeds numberOfBatches(); no check is visible here.
70 partitionSize -= foldStart[partition];
// Fill the fold with the consecutive batch indices
// foldStart[partition], foldStart[partition]+1, ...
73 IndexSet validationIndizes(partitionSize);
74 for (std::size_t batch = 0; batch != partitionSize; ++batch) {
75 validationIndizes[batch]=batch+foldStart[partition];
77 m_validationFolds.push_back(validationIndizes);
// Accessor fragment: returns the stored validation index set number i.
// The lookup uses operator[], so i is not range-checked on this line.
93 return m_validationFolds[i];
// Accessor fragment: fills `trainingFold` through detail::complement, passing the
// validation indices of fold i and the total number of batches.
// NOTE(review): presumably the result is every batch index NOT in the validation
// fold - confirm against detail::complement.
98 IndexSet trainingFold;
99 detail::complement(m_validationFolds[i], m_dataset.numberOfBatches(), trainingFold);
// Accessor fragment: number of stored validation folds.
105 return m_validationFolds.size();
// The dataset, held by value (initialised from the constructor argument).
119 DatasetType m_dataset;
// One index set of validation batches per fold.
120 std::vector<IndexSet> m_validationFolds;
// NOTE(review): the two members below are neither written nor read in any line
// visible here - possibly unused; confirm before relying on them.
121 std::size_t m_datasetSize;
122 std::vector<std::size_t> m_validationFoldSizes;
// Function-template fragment (function name and leading parameters are not
// visible in this extract). Visible behaviour: walks the per-class member lists,
// assigns elements to folds in rotating order, and writes each completed batch
// of a fold into `newSet`.
135 template<
class I,
class L>
// numberOfPartitions: number of folds to create.
138 std::size_t numberOfPartitions,
// members[c] holds the element positions belonging to class c. Taken by value,
// so the caller's vector is copied.
// NOTE(review): the loop tagged 151 below (body not visible) may modify this
// copy - confirm before changing it to a const reference.
139 std::vector< std::vector<std::size_t> > members,
// Optional output: when non-null it receives, per element, the original position
// (`first`) and the fold it was assigned to (`second`).
141 RecreationIndices * cv_indices = NULL
147 std::size_t numInputs =
set.numberOfElements();
148 std::size_t numClasses = members.size();
// NOTE(review): the body of this per-class loop is not visible here.
151 for (std::size_t c = 0; c != numClasses; c++) {
// Every fold gets floor(numInputs / numberOfPartitions) elements; the first
// `leftOver` folds get one extra.
// NOTE(review): divides by numberOfPartitions - a value of 0 is not guarded
// against in the visible lines.
156 std::size_t nn = numInputs / numberOfPartitions;
157 std::size_t leftOver = numInputs % numberOfPartitions;
158 std::vector<std::size_t> validationSize(numberOfPartitions,nn);
159 for (std::size_t partition = 0; partition != leftOver; partition++) {
160 validationSize[partition]++;
// batchPartitioning receives the fold sizes and the requested batchSize.
// NOTE(review): presumably it fills partitionStart (first batch index of each
// fold) and batchSizes (size of every batch) and returns the batch count - confirm.
164 std::vector<std::size_t> partitionStart;
165 std::vector<std::size_t> batchSizes;
166 std::size_t numBatches = batchPartitioning(validationSize,partitionStart,batchSizes,batchSize);
// Per-fold cursor: index of the next batch of that fold still to be written.
171 std::vector<std::size_t> validationSetStart = partitionStart;
173 std::size_t fold = 0;
// Elements collected so far for the current (incomplete) batch of each fold.
174 std::vector<std::vector<std::size_t> > batchElements(numberOfPartitions);
// Size both recreation arrays to one entry per element.
177 if ( cv_indices != NULL ) {
178 cv_indices->first.clear();
179 cv_indices->first.resize( numInputs );
180 cv_indices->second.clear();
181 cv_indices->second.resize( numInputs );
// Main distribution loop over all classes and their members.
185 for (std::size_t c = 0; c != numClasses; c++) {
186 for (std::size_t i = 0; i != members[c].size(); i++) {
187 std::size_t oldPos = members[c][i];
188 std::size_t batchNumber = validationSetStart[fold];
190 batchElements[fold].push_back(oldPos);
// Record where the element came from and which fold it went to.
// NOTE(review): `j` is not declared in any visible line - presumably a running
// element counter; confirm.
192 if ( cv_indices != NULL ) {
193 cv_indices->first[ j ] = oldPos;
194 cv_indices->second[ j ] = fold;
// Once the pending elements reach the size of the fold's current batch, write
// them out as that batch, reset the buffer and move the cursor to the next batch.
199 if (batchElements[fold].
size() == batchSizes[batchNumber]) {
200 newSet.
batch(validationSetStart[fold]) =
subBatch(setView,batchElements[fold]);
201 batchElements[fold].clear();
202 ++validationSetStart[fold];
// Move on to the next fold, wrapping around after the last one.
205 fold = (fold+1) % numberOfPartitions;
// Function-template fragment (function name not visible in this extract).
// Visible behaviour: allocates one index slot per dataset element and starts a
// loop over all elements.
// NOTE(review): the loop body and everything after it are not visible here, so
// how `indices` is filled and used cannot be stated from this view.
239 template<
class I,
class L>
241 std::size_t numberOfPartitions,
243 std::vector<std::size_t> indices(
set.numberOfElements());
244 for (std::size_t i=0; i !=
set.numberOfElements(); i++)
// Function-template fragment (function name and parameter list not visible in
// this extract). Visible behaviour: computes near-equal fold sizes, derives a
// batch layout from them and repartitions `set` to that layout.
260 template<
class I,
class L>
262 std::size_t numInputs =
set.numberOfElements();
// Each fold gets floor(numInputs / numberOfPartitions) elements; the first
// `leftOver` folds get one more (the bool `i<leftOver` adds 0 or 1).
// NOTE(review): divides by numberOfPartitions - 0 is not guarded against in the
// visible lines.
265 std::vector<std::size_t> validationSize(numberOfPartitions);
266 std::size_t inputsForValidation = numInputs / numberOfPartitions;
267 std::size_t leftOver = numInputs - inputsForValidation * numberOfPartitions;
268 for (std::size_t i = 0; i != numberOfPartitions; i++) {
269 std::size_t vs=inputsForValidation+(i<leftOver);
270 validationSize[i] =vs;
// NOTE(review): presumably batchPartitioning fills partitionStart and batchSizes
// from the fold sizes and the requested batchSize - confirm.
274 std::vector<std::size_t> partitionStart;
275 std::vector<std::size_t> batchSizes;
276 detail::batchPartitioning(validationSize,partitionStart,batchSizes,
batchSize);
// Re-cut the dataset's batches to the computed sizes.
278 set.repartition(batchSizes);
// Function fragment (name, template header and remaining parameters not visible
// in this extract). Visible behaviour: groups element positions by their label.
300 std::size_t numberOfPartitions,
// Optional recreation-indices output; defaults to NULL.
// NOTE(review): how it is used is not visible here.
302 RecreationIndices * cv_indices = NULL
305 std::size_t numInputs = setView.
size();
// members[c] collects the positions of all elements whose label is c.
// NOTE(review): the label is used directly as an index into `members`; nothing
// visible here checks label < numClasses (numClasses is defined in a line not
// shown).
310 std::vector< std::vector<std::size_t> > members(numClasses);
311 for (std::size_t i = 0; i != numInputs; i++) {
312 members[setView[i].label].push_back(i);
// Function-template fragment (function name not visible in this extract).
// Visible behaviour: builds folds at batch granularity - each fold is an IndexSet
// copied from a sub-range of a per-batch index vector.
326 template<
class I,
class L>
329 std::size_t numberOfPartitions
// One index slot per batch of the dataset.
// NOTE(review): the body of the loop below is not visible, so how the slots are
// filled (or reordered) cannot be stated from this view.
331 std::vector<std::size_t> indizes(
set.numberOfBatches());
332 for(std::size_t i= 0; i !=
set.numberOfBatches(); ++i)
// Base fold size is floor(numberOfBatches / numberOfPartitions); `remainder`
// is the number of batches left over after that even split.
// NOTE(review): divides by numberOfPartitions - 0 is not guarded against in the
// visible lines.
338 std::vector<IndexSet> folds;
339 std::size_t partitionSize =
set.numberOfBatches()/numberOfPartitions;
340 std::size_t remainder =
set.numberOfBatches() - partitionSize*numberOfPartitions;
341 std::vector<std::size_t>::iterator pos = indizes.begin();
342 for(std::size_t i = 0; i!= numberOfPartitions; ++i){
343 std::size_t
size = partitionSize;
// NOTE(review): no visible line consumes `remainder` or advances `pos`;
// presumably that happens in the lines missing around this point - confirm.
348 folds.push_back(
IndexSet(pos,pos+size));
// Function-template fragment (function name and some parameters not visible in
// this extract). Visible behaviour: partitions the elements according to a
// caller-supplied fold index per element and writes each completed batch into
// `newSet`.
365 template<
class I,
class L>
368 std::size_t numberOfPartitions,
// indices[input] is the fold that element `input` is assigned to. Taken by
// value, so the caller's vector is copied.
369 std::vector<std::size_t> indices,
372 std::size_t numInputs =
set.numberOfElements();
// The largest fold index must be numberOfPartitions-1.
// NOTE(review): std::max_element returns end() for an empty range, and the
// result is dereferenced unconditionally - an empty `indices` is not guarded.
// NOTE(review): no visible line checks indices.size() against numInputs, yet
// indices[input] is read for every input < numInputs below.
374 SIZE_CHECK(numberOfPartitions == *std::max_element(indices.begin(),indices.end())+1);
// Count how many elements each fold receives.
377 std::vector<std::size_t> validationSize(numberOfPartitions,0);
378 for (std::size_t input = 0; input != numInputs; input++) {
379 validationSize[indices[input]]++;
// NOTE(review): presumably batchPartitioning fills partitionStart (first batch
// of each fold) and batchSizes and returns the batch count - confirm.
383 std::vector<std::size_t> partitionStart;
384 std::vector<std::size_t> batchSizes;
385 std::size_t numBatches = detail::batchPartitioning(validationSize,partitionStart,batchSizes,
batchSize);
// Per-fold cursor (next batch to write) and per-fold buffer of pending elements.
390 std::vector<std::size_t> validationSetStart = partitionStart;
391 std::vector<std::vector<std::size_t> > batchElements(numberOfPartitions);
392 for (std::size_t input = 0; input != numInputs; input++) {
393 std::size_t partition = indices[input];
394 batchElements[partition].push_back(input);
// When the buffer reaches the size of the fold's current batch, write it out,
// reset the buffer and advance the fold's cursor.
397 std::size_t batchNumber = validationSetStart[partition];
398 if (batchElements[partition].
size() == batchSizes[batchNumber]) {
399 newSet.
batch(validationSetStart[partition]) =
subBatch(setView,batchElements[partition]);
400 batchElements[partition].clear();
401 ++validationSetStart[partition];
// Function-template fragment (function name and some parameters not visible in
// this extract). Visible behaviour: partitions the elements using caller-supplied
// recreation indices - `indices.first` gives the element to take at each step,
// `indices.second` the fold it goes to - and writes each completed batch into
// `newSet`.
426 template<
class I,
class L>
429 std::size_t numberOfPartitions,
// Taken by value, so both contained sequences are copied.
430 RecreationIndices indices,
433 std::size_t numInputs =
set.numberOfElements();
// Both sequences must have exactly one entry per element, and the largest fold
// index must be numberOfPartitions-1.
// NOTE(review): if numInputs is 0 the size checks pass with empty sequences and
// the max_element result (end()) is then dereferenced - not guarded here.
434 SIZE_CHECK(indices.first.size() == numInputs);
435 SIZE_CHECK(indices.second.size() == numInputs);
436 SIZE_CHECK(numberOfPartitions == *std::max_element(indices.second.begin(),indices.second.end())+1);
// Count how many elements each fold receives.
439 std::vector<std::size_t> validationSize(numberOfPartitions,0);
440 for (std::size_t input = 0; input != numInputs; input++) {
441 validationSize[indices.second[input]]++;
// NOTE(review): presumably batchPartitioning fills partitionStart (first batch
// of each fold) and batchSizes and returns the batch count - confirm.
445 std::vector<std::size_t> partitionStart;
446 std::vector<std::size_t> batchSizes;
447 std::size_t numBatches = detail::batchPartitioning(validationSize,partitionStart,batchSizes,
batchSize);
// Per-fold cursor (next batch to write) and per-fold buffer of pending elements.
452 std::vector<std::size_t> validationSetStart = partitionStart;
453 std::vector<std::vector<std::size_t> > batchElements(numberOfPartitions);
454 for (std::size_t input = 0; input != numInputs; input++) {
455 std::size_t partition = indices.second[input];
// Unlike a plain per-element fold index, the element taken here is
// indices.first[input], not `input` itself.
456 batchElements[partition].push_back( indices.first[input] );
// When the buffer reaches the size of the fold's current batch, write it out,
// reset the buffer and advance the fold's cursor.
459 std::size_t batchNumber = validationSetStart[partition];
460 if (batchElements[partition].
size() == batchSizes[batchNumber]) {
461 newSet.
batch(validationSetStart[partition]) =
subBatch(setView,batchElements[partition]);
462 batchElements[partition].clear();
463 ++validationSetStart[partition];