LinearSAGTrainer.h
//===========================================================================
/*!
 *
 * \brief Generic Stochastic Average Gradient Descent training for linear models
 *
 * \author O. Krause
 * \date 2016
 *
 * \par Copyright 1995-2017 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <http://shark-ml.org/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================


#ifndef SHARK_ALGORITHMS_LinearSAGTrainer_H
#define SHARK_ALGORITHMS_LinearSAGTrainer_H

#include <shark/Algorithms/Trainers/AbstractWeightedTrainer.h>
#include <shark/Core/IParameterizable.h>
#include <shark/Models/LinearModel.h>
#include <shark/ObjectiveFunctions/Loss/AbstractLoss.h>
#include <shark/Data/DataView.h>

namespace shark
{

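// Trait selecting the trainer interface and loss type for the two label cases:
// unsigned int labels dispatch to a LinearClassifier trainer, all other label
// types to a plain LinearModel trainer.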
namespace detail{
    template<class InputType, class LabelType>
    struct LinearSAGTrainerBase{
        typedef AbstractWeightedTrainer< LinearModel<InputType>, LabelType > type;
        typedef AbstractLoss<LabelType, RealVector> LossType;
    };
    template<class InputType>
    struct LinearSAGTrainerBase<InputType, unsigned int>{
        typedef AbstractWeightedTrainer< LinearClassifier<InputType>, unsigned int > type;
        typedef AbstractLoss<unsigned int, RealVector> LossType;
    };
}


///
/// \brief Stochastic Average Gradient method for training linear models.
///
/// Given a differentiable loss function L(f, y) and a model f_j(x) = w_j^T x + b_j,
/// this trainer solves the regularized risk minimization problem
/// \f[
///     \min \frac{1}{2} \sum_j \frac{\lambda}{2}\|w_j\|^2 + \frac 1 {\ell} \sum_i L(y_i, f(x_i)),
/// \f]
/// where i runs over the training data, j over the model outputs, and lambda > 0 is the
/// regularization parameter.
///
/// The algorithm uses gradient averaging to obtain a good estimate of the full gradient.
/// The average is formed by summing, over all data points, the last gradient value obtained
/// for each point. At the beginning this estimate is far off, as the stored gradient values
/// are outdated, but it improves as the algorithm converges; this yields linear convergence
/// on strongly convex objectives and O(1/T) convergence on objectives that are merely convex.
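///
/// As a sketch of the update performed below (notation matching the implementation, not a
/// full restatement of the algorithm): with a stored loss derivative D_i for every training
/// point and sampling probabilities p_i proportional to the data weights, each iteration
/// draws a point b and performs
/// \f[
///     g \leftarrow g + p_b\,(D_b^{new} - D_b^{old})\,x_b^T, \qquad
///     W \leftarrow (1 - \eta\lambda)\,W - \eta\,g, \qquad \eta = \frac{1}{L_{est} + \lambda},
/// \f]
/// where L_{est} is a running estimate of the Lipschitz constant maintained by a line search.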
///
/// The algorithm supports classification and regression, dense and sparse inputs,
/// and weighted and unweighted datasets.
///
/// Reference:
/// Schmidt, Mark, Nicolas Le Roux, and Francis Bach.
/// "Minimizing finite sums with the stochastic average gradient."
/// arXiv preprint arXiv:1309.2388 (2013).
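///
/// A minimal usage sketch (regression with dense real-valued inputs and labels; the loss and
/// dataset variables are illustrative):
/// \code
/// SquaredLoss<> loss;                                            // (sub-)differentiable loss
/// LinearSAGTrainer<RealVector, RealVector> trainer(&loss, 0.1);  // lambda = 0.1
/// LinearModel<RealVector> model;
/// trainer.train(model, trainingData);                            // LabeledData<RealVector, RealVector>
/// \endcode
/// For unsigned int labels the same pattern trains a LinearClassifier<InputType> instead,
/// e.g. with a CrossEntropy loss.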
template <class InputType, class LabelType>
class LinearSAGTrainer : public detail::LinearSAGTrainerBase<InputType,LabelType>::type, public IParameterizable<>
{
private:
    typedef typename detail::LinearSAGTrainerBase<InputType,LabelType>::type Base;
public:
    typedef typename Base::ModelType ModelType;
    typedef WeightedLabeledData<InputType, LabelType> WeightedDatasetType;
    typedef typename detail::LinearSAGTrainerBase<InputType,LabelType>::LossType LossType;


    /// \brief Constructor
    ///
    /// \param loss (sub-)differentiable loss function
    /// \param lambda regularization parameter for two-norm regularization, 0 by default
    /// \param offset whether to train with an offset/bias parameter or not, default is true
    LinearSAGTrainer(LossType const* loss, double lambda = 0, bool offset = true)
    : mep_loss(loss)
    , m_lambda(lambda)
    , m_offset(offset)
    , m_maxEpochs(0)
    { }

    /// \brief From INameable: return the class name.
    std::string name() const
    { return "LinearSAGTrainer"; }

    using Base::train;
    void train(ModelType& model, WeightedDatasetType const& dataset){
        trainImpl(random::globalRng, model, dataset, *mep_loss);
    }


    /// \brief Return the number of training epochs.
    /// A value of 0 indicates that the default of max(10, dimensionOfData) should be used.
    std::size_t epochs() const
    { return m_maxEpochs; }

    /// \brief Set the number of training epochs.
    /// A value of 0 indicates that the default of max(10, dimensionOfData) should be used.
    void setEpochs(std::size_t value)
    { m_maxEpochs = value; }


    /// \brief Return the value of the regularization parameter lambda.
    double lambda() const
    { return m_lambda; }

    /// \brief Set the value of the regularization parameter lambda.
    void setLambda(double lambda)
    { m_lambda = lambda; }

    /// \brief Check whether the model to be trained should include an offset term.
    bool trainOffset() const
    { return m_offset; }

    /// \brief Sets whether the model to be trained should include an offset term.
    void setTrainOffset(bool offset)
    { m_offset = offset; }

    /// \brief Returns the vector of hyper-parameters (here: the regularization parameter lambda).
    RealVector parameterVector() const
    {
        return RealVector(1, m_lambda);
    }

    /// \brief Sets the vector of hyper-parameters (here: the regularization parameter lambda).
    void setParameterVector(RealVector const& newParameters)
    {
        SIZE_CHECK(newParameters.size() == 1);
        m_lambda = newParameters(0);
    }

    /// \brief Returns the number of hyper-parameters.
    std::size_t numberOfParameters() const
    {
        return 1;
    }

private:
    //initializes the model in the classification case and calls iterate to train it
    void trainImpl(
        random::rng_type& rng,
        LinearClassifier<InputType>& classifier,
        WeightedLabeledData<InputType, unsigned int> const& dataset,
        AbstractLoss<unsigned int, RealVector> const& loss
    ){
        //initialize model
        std::size_t classes = numberOfClasses(dataset);
        if(classes == 2) classes = 1;//special case: binary classification is encoded by the sign of a single output
        std::size_t dim = inputDimension(dataset);
        auto& model = classifier.decisionFunction();
        model.setStructure(dim, classes, m_offset);

        iterate(rng, model, dataset, loss);
    }
    //initializes the model in the regression case and calls iterate to train it
    template<class LabelT>
    void trainImpl(
        random::rng_type& rng,
        LinearModel<InputType>& model,
        WeightedLabeledData<InputType, LabelT> const& dataset,
        AbstractLoss<LabelT, RealVector> const& loss
    ){
        //initialize model
        std::size_t labelDim = labelDimension(dataset);
        std::size_t dim = inputDimension(dataset);
        model.setStructure(dim, labelDim, m_offset);
        iterate(rng, model, dataset, loss);
    }

    //dense vector case is the easier one; it is implemented mostly to keep the exposition of the algorithm simple
    template<class T>
    void iterate(
        random::rng_type& rng,
        LinearModel<blas::vector<T> >& model,
        WeightedLabeledData<blas::vector<T>, LabelType> const& dataset,
        AbstractLoss<LabelType, RealVector> const& loss
    ){

        //get stats of the dataset
        DataView<LabeledData<InputType, LabelType> const> data(dataset.data());
        std::size_t ell = data.size();
        std::size_t labelDim = model.outputShape().numElements();
        std::size_t dim = model.inputShape().numElements();

        //set number of iterations
        std::size_t iterations = m_maxEpochs * ell;
        if(m_maxEpochs == 0)
            iterations = std::max(10 * ell, std::size_t(std::ceil(dim * ell)));

        //picking distribution picks proportional to weight
        RealVector probabilities = createBatch(dataset.weights().elements());
        probabilities /= sum(probabilities);
        MultiNomialDistribution dist(probabilities);

        //variables used for the SAG loop
        RealMatrix gradD(labelDim,ell,0); // gradients of regularized loss minimization with a linear model have the form sum_i D_i*x_i; we store the last acquired estimate D_i for every point
        RealMatrix grad(labelDim,dim);// gradient estimate for the weight matrix
        RealVector gradOffset(labelDim,0); //sum_i D_i, gradient estimate for the offset
        RealVector pointNorms(ell); //squared norm of each point in the dataset
        for(std::size_t i = 0; i != ell; ++i){
            pointNorms(i) = norm_sqr(data[i].input);
        }
        // preinitialize everything to prevent costly memory allocations in the loop
        RealVector f_b(labelDim, 0.0); // prediction of the model
        RealVector derivative(labelDim, 0.0); //derivative of the loss
        double L = 1; // initial estimate of the Lipschitz constant

        // SAG loop
        for(std::size_t iter = 0; iter < iterations; iter++)
        {
            // choose data point
            std::size_t b = dist(rng);

            // compute prediction
            noalias(f_b) = prod(model.matrix(), data[b].input);
            if(m_offset) noalias(f_b) += model.offset();

            // compute loss gradient
            double currentValue = loss.evalDerivative(data[b].label, f_b, derivative);

            //update the gradient estimate with the new derivative of point b
            noalias(grad) += probabilities(b) * outer_prod(derivative-column(gradD,b), data[b].input);
            if(m_offset) noalias(gradOffset) += probabilities(b) *(derivative-column(gradD,b));
            noalias(column(gradD,b)) = derivative; //we got a new estimate for D of element b

            // update the model: 2-norm shrinkage followed by a step along the averaged gradient
            double eta = 1.0/(L+m_lambda);
            noalias(model.matrix()) *= 1 - eta * m_lambda;//2-norm regularization
            for(std::size_t i = 0; i != labelDim; ++i){
                for(std::size_t j = 0; j != dim; ++j){
                    model.matrix()(i,j) -= eta*grad(i,j);
                }
            }
            //~ noalias(model.matrix()) -= eta * grad;
            if(m_offset) noalias(model.offset()) -= eta * gradOffset;

            //line-search procedure, 4.6 in the paper
            noalias(f_b) -= derivative/L*pointNorms(b);
            double newValue = loss.eval(data[b].label, f_b);
            if(norm_sqr(derivative)*pointNorms(b) > 1.e-8 && newValue > currentValue - 1/(2*L)*norm_sqr(derivative)*pointNorms(b)){
                L *= 2;
            }
            L *= std::pow(2.0,-1.0/ell);//allow L to shrink slightly in case our initial estimate was too large
        }
    }

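    // Sparse-input variant of the same algorithm. To avoid touching the full dense weight matrix in
    // every iteration, the matrix is represented implicitly as kappa * model.matrix(), where kappa
    // absorbs the 2-norm shrinkage, and the dense step "matrix -= eta * grad" is accumulated and
    // applied just in time, only for the columns corresponding to nonzero coordinates of the sampled
    // point (with a full flush once per epoch).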
    template<class T>
    void iterate(
        random::rng_type& rng,
        LinearModel<blas::compressed_vector<T> >& model,
        WeightedLabeledData<blas::compressed_vector<T>, LabelType> const& dataset,
        AbstractLoss<LabelType, RealVector> const& loss
    ){

        //get stats of the dataset
        DataView<LabeledData<InputType, LabelType> const> data(dataset.data());
        std::size_t ell = data.size();
        std::size_t labelDim = model.outputSize();
        std::size_t dim = model.inputSize();

        //set number of iterations
        std::size_t iterations = m_maxEpochs * ell;
        if(m_maxEpochs == 0)
            iterations = std::max(10 * ell, std::size_t(std::ceil(dim * ell)));

        //picking distribution picks proportional to weight
        RealVector probabilities = createBatch(dataset.weights().elements());
        probabilities /= sum(probabilities);
        MultiNomialDistribution dist(probabilities);

        //variables used for the SAG loop
        blas::matrix<double,blas::column_major> gradD(labelDim,ell,0); // gradients of regularized loss minimization with a linear model have the form sum_i D_i*x_i; we store the last acquired estimate D_i for every point
        RealMatrix grad(labelDim,dim);// gradient estimate for the weight matrix
        RealVector gradOffset(labelDim,0); //sum_i D_i, gradient estimate for the offset
        RealVector pointNorms(ell); //squared norm of each point in the dataset
        for(std::size_t i = 0; i != ell; ++i){
            pointNorms(i) = norm_sqr(data[i].input);
        }
        // preinitialize everything to prevent costly memory allocations in the loop
        RealVector f_b(labelDim, 0.0); // prediction of the model
        RealVector derivative(labelDim, 0.0); //derivative of the loss
        double kappa = 1; //we store the matrix as kappa*model.matrix(), where kappa absorbs the effect of the 2-norm regularization
        double L = 1; // initial estimate of the Lipschitz constant

        //just-in-time updating for sparse inputs:
        //we store a running sum of step lengths, which we apply to a column whenever its index is touched again
        RealVector appliedRates(dim,0.0);//applied steps since the last reset for each dimension
        double stepsCumSum = 0.0;// cumulative update steps


        // SAG loop
        for(std::size_t iter = 0; iter < iterations; iter++)
        {
            // choose data point
            std::size_t b = dist(rng);
            auto const& point = data[b];

            //just-in-time update of the previous steps for every nonzero element of point b
            auto end = point.input.end();
            for(auto pos = point.input.begin(); pos != end; ++pos){
                std::size_t index = pos.index();
                noalias(column(model.matrix(),index)) -= (stepsCumSum - blas::repeat(appliedRates(index),labelDim))*column(grad,index);
                appliedRates(index) = stepsCumSum;
            }

            // compute prediction
            noalias(f_b) = kappa * prod(model.matrix(), point.input);
            if(m_offset) noalias(f_b) += model.offset();

            // compute loss gradient
            double currentValue = loss.evalDerivative(point.label, f_b, derivative);

            //update the gradient estimate with the new derivative of point b
            //~ noalias(grad) += probabilities(b) * outer_prod(derivative-column(gradD,b), point.input); //todo: this is slow for some reason.
            for(std::size_t l = 0; l != derivative.size(); ++l){
                double val = probabilities(b) * (derivative(l) - gradD(l,b));
                noalias(row(grad,l)) += val * point.input;
            }

            if(m_offset) noalias(gradOffset) += probabilities(b) *(derivative-column(gradD,b));
            noalias(column(gradD,b)) = derivative; //we got a new estimate for D of element b

            // update the model; the matrix step is accumulated and applied just in time (see above)
            double eta = 1.0/(L+m_lambda);
            stepsCumSum += kappa * eta;//we delay the update of the matrix
            if(m_offset) noalias(model.offset()) -= eta * gradOffset;
            kappa *= 1 - eta * m_lambda;//2-norm regularization

            //line-search procedure, 4.6 in the paper
            noalias(f_b) -= derivative/L*pointNorms(b);
            double newValue = loss.eval(point.label, f_b);
            if(norm_sqr(derivative)*pointNorms(b) > 1.e-8 && newValue > currentValue - 1/(2*L)*norm_sqr(derivative)*pointNorms(b)){
                L *= 2;
            }
            L *= std::pow(2.0,-1.0/ell);//allow L to shrink slightly in case our initial estimate was too large


            //every epoch we reset the internal variables for numerical stability and apply all outstanding updates;
            //this also ensures that in the last iteration all updates are applied (as iterations is a multiple of ell)
            if((iter + 1) % ell == 0){
                noalias(model.matrix()) -= (stepsCumSum - blas::repeat(appliedRates,labelDim))*grad;
                model.matrix() *= kappa;
                kappa = 1;
                stepsCumSum = 0.0;
                appliedRates.clear();
            }
        }
    }

    LossType const* mep_loss; ///< pointer to the loss function
    double m_lambda; ///< regularization parameter
    bool m_offset; ///< should the resulting model have an offset term?
    std::size_t m_maxEpochs; ///< number of training epochs (sweeps over the data), or 0 for the default of max(10, dimensionOfData)
};

}
#endif