45 #ifndef SHARK_DATA_DATASET_H 46 #define SHARK_DATA_DATASET_H 48 #include <boost/range/iterator_range.hpp> 53 #include <boost/iterator/transform_iterator.hpp> 56 #include "Impl/Dataset.inl" 127 template <
class Type>
141 typedef typename Container::BatchType batch_type;
145 typedef Type element_type;
153 return (m_data == rhs.
m_data);
158 return (! (*
this == rhs));
164 typedef boost::iterator_range< detail::DataElementIterator<Data<Type> > >
element_range;
176 detail::DataElementIterator<
Data<Type> const>(
this,0,0,0),
186 detail::DataElementIterator<
Data<Type> >(
this,0,0,0),
208 return m_data.size();
212 return m_data.numberOfElements();
228 return m_data.empty();
233 return *(detail::DataElementIterator<Data<Type> >(
this,0,0,0)+i);
235 const_element_reference
element(std::size_t i)
const{
236 return *(detail::DataElementIterator<Data<Type>
const>(
this,0,0,0)+i);
240 batch_reference
batch(std::size_t i){
241 return *(m_data.begin()+i);
243 const_batch_reference
batch(std::size_t i)
const{
244 return *(m_data.begin()+i);
253 explicit Data(std::size_t numBatches) : m_data( numBatches )
280 m_data.makeIndependent();
287 m_data.splitBatch(m_data.begin()+
batch,elementIndex);
296 right.
m_data = m_data.splice(m_data.begin()+
batch);
306 m_data.append(other.
m_data);
310 m_data.push_back(batch);
317 template<
class Range>
319 m_data.repartition(batchSizes);
327 return m_data.getPartitioning();
335 detail::complement(indices,m_data.size(),comp);
360 std::ostream &operator << (std::ostream &stream, const Data<T>& d) {
361 for(
auto elem:d.elements())
362 stream << elem <<
"\n";
372 template <
class InputT>
376 typedef InputT element_type;
382 using base_type::m_data;
409 : base_type(numBatches)
414 :base_type(container,batchSizes){}
466 template <
class InputT,
class LabelT>
476 static const std::size_t DefaultBatchSize = InputContainer::DefaultBatchSize;
479 typedef InputLabelBatch<
484 typedef InputLabelPair<InputType,LabelType> element_type;
487 typedef InputLabelBatch<
488 typename Batch<InputType>::type&,
491 typedef InputLabelBatch<
492 typename Batch<InputType>::type
const&,
499 typedef boost::iterator_range< detail::DataElementIterator<LabeledData<InputType,LabelType> > >
element_range;
500 typedef boost::iterator_range< detail::DataElementIterator<LabeledData<InputType,LabelType>
const> >
const_element_range;
501 typedef detail::BatchRange<LabeledData<InputType,LabelType> >
batch_range;
543 return m_data.numberOfBatches();
547 return m_data.numberOfElements();
583 :
m_data(numBatches),m_label(numBatches)
602 :
m_data(inputs), m_label(labels)
613 return *(detail::DataElementIterator<LabeledData<InputType,LabelType> >(
this,0,0,0)+i);
615 const_element_reference
element(std::size_t i)
const{
616 return *(detail::DataElementIterator<LabeledData<InputType,LabelType>
const>(
this,0,0,0)+i);
639 return m_label.shape();
644 return m_label.shape();
663 m_label.makeIndependent();
673 m_data.splitBatch(batch,elementIndex);
674 m_label.splitBatch(batch,elementIndex);
695 typename Batch<InputType>::type
const& inputs,
699 m_label.push_back(labels);
713 template<
class Range>
715 m_data.repartition(batchSizes);
716 m_label.repartition(batchSizes);
724 return m_data.getPartitioning();
737 return LabeledData(
m_data.indexedSubset(indices),m_label.indexedSubset(indices));
753 template<
class Functor,
class T>
761 static Shape infer(T
const&){
return {};}
765 struct InferShape<Data<blas::vector<T> > >{
766 static Shape infer(Data<blas::vector<T> >
const& f){
767 return {f.element(0).
size()};
772 struct InferShape<Data<blas::compressed_vector<T> > >{
773 static Shape infer(Data<blas::compressed_vector<T> >
const& f){
774 return {f.element(0).
size()};
786 template<
class Range>
789 typedef typename Range::value_type Input;
791 if (maximumBatchSize == 0)
794 std::size_t numPoints = inputs.size();
796 std::size_t
batches = numPoints / maximumBatchSize;
797 if(numPoints > batches*maximumBatchSize)
799 std::size_t optimalBatchSize=numPoints/
batches;
800 std::size_t remainder = numPoints-batches*optimalBatchSize;
804 auto start= inputs.begin();
805 for(std::size_t i = 0; i !=
batches; ++i){
806 std::size_t size = (i<remainder)?optimalBatchSize+1:optimalBatchSize;
807 auto end = start+size;
808 data.
batch(i) = createBatch<Input>(
809 boost::make_iterator_range(start,end)
813 data.
shape() = detail::InferShape<Data<Input> >::infer(data);
818 template<
class Range>
824 template<
class Range1,
class Range2>
826 typename boost::range_value<Range1>::type,
827 typename boost::range_value<Range2>::type
830 SHARK_RUNTIME_CHECK(inputs.size() == labels.size(),
"Number of inputs and number of labels must agree");
831 typedef typename boost::range_value<Range1>::type Input;
832 typedef typename boost::range_value<Range2>::type Label;
834 if (maximumBatchSize == 0)
844 template<
class T,
class U>
845 std::ostream &operator << (std::ostream &stream, const LabeledData<T, U>& d) {
846 for(
auto elem: d.elements())
847 stream << elem.input <<
" [" << elem.label <<
"]"<<
"\n";
857 unsigned int classes = 0;
859 classes = std::max(classes,*std::max_element(labels.
batch(i).begin(),labels.
batch(i).end()));
868 for(
unsigned int elem: labels.
batch(i)){
876 template <
class InputType>
879 return dataset.
element(0).size();
883 template <
class InputType,
class LabelType>
889 template <
class InputType,
class LabelType>
894 template <
class InputType>
899 template<
class InputType,
class LabelType>
908 template<
class T,
class Functor>
909 typename boost::lazy_disable_if<
914 typedef typename detail::TransformedDataElement<Functor,T>::type ResultType;
918 result.
batch(i)= createBatch<ResultType>(
922 result.
shape() = detail::InferShape<Data<ResultType> >::infer(result);
929 template<
class T,
class Functor>
930 typename boost::lazy_enable_if<
931 CanBeCalled<Functor,typename Data<T>::batch_type>,
935 typedef typename detail::TransformedDataElement<Functor,T>::type ResultType;
940 Shape shape = detail::InferShape<Functor>::infer(f);
941 if(shape ==
Shape()){
942 shape = detail::InferShape<Data<ResultType> >::infer(result);
949 template<
class I,
class L,
class Functor>
956 template<
class I,
class L,
class Functor>
964 template<
class T,
class FeatureSet>
966 auto select = [&](blas::matrix<T>
const& input){
967 blas::matrix<T> output(input.size1(),features.size());
968 for(std::size_t i = 0; i != input.size1(); ++i){
969 for(std::size_t j = 0; j != features.size(); ++j){
970 output(i,j) = input(i,features[j]);
978 template<
class T,
class FeatureSet>
991 template<
class DatasetT>
993 SIZE_CHECK(elementIndex<=data.numberOfElements());
995 std::size_t batchPos = 0;
996 std::size_t batchStart = 0;
997 while(batchStart +
batchSize(data.batch(batchPos)) < elementIndex){
998 batchStart +=
batchSize(data.batch(batchPos));
1001 std::size_t splitPoint = elementIndex-batchStart;
1002 if(splitPoint != 0){
1003 data.splitBatch(batchPos,splitPoint);
1007 return data.splice(batchPos);
1017 std::vector<std::size_t > classCounts =
classSizes(data);
1018 std::vector<std::size_t > partitioning;
1019 std::vector<std::size_t > classStart;
1020 detail::batchPartitioning(classCounts, classStart, partitioning,
batchSize);
1025 std::vector<std::size_t> bat = classStart;
1026 std::vector<std::size_t> idx(classStart.size(), 0);
1030 if(b == classStart[c+1]) ++c;
1034 unsigned int label =
batch.label[e];
1038 if (e ==
batch.size())
1047 auto&& batch2 = data.
batch(bat[label]);
1051 if (idx[label] == batch2.size())
1064 unsigned int zeroClass,
1065 unsigned int oneClass
1067 std::vector<std::size_t> indexSet;
1068 std::size_t smaller = std::min(zeroClass,oneClass);
1069 std::size_t bigger = std::max(zeroClass,oneClass);
1073 std::size_t start= 0;
1079 indexSet.push_back(start);
1088 indexSet.push_back(start);
1104 unsigned int oneClass
1106 return transformLabels(data, [=](
unsigned int label){
return (
unsigned int)(label == oneClass);});
1109 template <
typename RowType>
1113 std::size_t rowCounter = 0;
1115 column(rowCounter) =
element(columnID);
1121 template <
typename RowType>
1125 std::size_t rowCounter = 0;
1127 element(columnID) = newColumn(rowCounter);