KernelBudgetedSGDTrainer.h
//===========================================================================
/*!
 *
 *
 * \brief Budgeted stochastic gradient descent training for kernel-based models.
 *
 * \par This is an implementation of the BSGD algorithm, developed by
 * Wang, Crammer and Vucetic: Breaking the curse of kernelization:
 * Budgeted stochastic gradient descent for large-scale SVM training, JMLR 2012.
 * Basically this is Pegasos, i.e. something similar to a perceptron. The main
 * difference is that we restrict the sparsity of the weight vector to a
 * (currently predefined) value, the budget size. Whenever this budget is full,
 * we have to decide how to add a new vector to the model without exceeding the
 * budget. Several methods have been proposed for this; the main insight of
 * Wang et al. is to merge two budget vectors (i.e. two vectors in the model).
 * The first one is chosen by the norm of its alpha coefficients, the second one
 * is then found by solving a small optimization problem, yielding a roughly
 * optimal pair. Merging this pair frees space in the budget for a new vector.
 * Such strategies are called budget maintenance strategies.
 *
 * \par This implementation owes much to the 'reference' implementation
 * in the BudgetedSVM software.
 *
 *
 * \author T. Glasmachers, Aydin Demircioglu
 * \date 2014
 *
 *
 * \par Copyright 1995-2017 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <http://shark-ml.org/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================


#ifndef SHARK_ALGORITHMS_KERNELBUDGETEDSGDTRAINER_H
#define SHARK_ALGORITHMS_KERNELBUDGETEDSGDTRAINER_H

#include <iostream>


namespace shark
{


///
/// \brief Budgeted stochastic gradient descent training for kernel-based models.
///
/// \par This is an implementation of the BSGD algorithm, developed by
/// Wang, Crammer and Vucetic: Breaking the curse of kernelization:
/// Budgeted stochastic gradient descent for large-scale SVM training, JMLR 2012.
/// Basically this is Pegasos, i.e. something similar to a perceptron. The main
/// difference is that we restrict the sparsity of the weight vector to a
/// (currently predefined) value, the budget size. Whenever this budget is full,
/// we have to decide how to add a new vector to the model without exceeding the
/// budget. Several methods have been proposed for this; the main insight of
/// Wang et al. is to merge two budget vectors (i.e. two vectors in the model).
/// The first one is chosen by the norm of its alpha coefficients, the second one
/// is then found by solving a small optimization problem, yielding a roughly
/// optimal pair. Merging this pair frees space in the budget for a new vector.
/// Such strategies are called budget maintenance strategies.
///
/// \par This implementation owes much to the 'reference' implementation
/// in the BudgetedSVM software.
///
/// \par For the documentation of the basic SGD algorithm, please refer to
/// KernelSGDTrainer.h. Note that we did not take over the special alpha scaling
/// from that class. Therefore this class is perhaps numerically not as robust as SGD.
///
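/// \par The following usage sketch is purely illustrative. It assumes a
/// Gaussian kernel, the hinge loss and the merge-based budget maintenance
/// strategy as provided by Shark (GaussianRbfKernel, HingeLoss and
/// MergeBudgetMaintenanceStrategy); adapt names and parameters as needed.
/// \code
/// GaussianRbfKernel<RealVector> kernel(0.5);            // RBF kernel with gamma = 0.5
/// HingeLoss hingeLoss;                                  // (sub-)differentiable loss
/// MergeBudgetMaintenanceStrategy<RealVector> strategy;  // merging strategy of Wang et al.
/// std::size_t budgetSize = 500;
/// KernelBudgetedSGDTrainer<RealVector> trainer(
///     &kernel, &hingeLoss, 1.0, false, false, budgetSize, &strategy);
/// KernelClassifier<RealVector> classifier;
/// trainer.train(classifier, trainingData);              // trainingData: LabeledData<RealVector, unsigned int>
/// \endcode
///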
template <class InputType, class CacheType = float>
class KernelBudgetedSGDTrainer : public AbstractTrainer< KernelClassifier<InputType> >, public IParameterizable<>
{
public:
    typedef AbstractKernelFunction<InputType> KernelType;
    typedef KernelClassifier<InputType> ClassifierType;
    typedef KernelExpansion<InputType> ModelType;
    typedef AbstractLoss<unsigned int, RealVector> LossType;
    typedef CacheType QpFloatType;


    /// preinitialization methods
    enum preInitializationMethod {NONE, RANDOM}; // TODO: add KMEANS


    /// \brief Constructor
    /// Note that there is no cache size involved, as merging vectors will always create new ones,
    /// which makes caching roughly obsolete.
    ///
    /// \param[in] kernel kernel function to use for training and prediction
    /// \param[in] loss (sub-)differentiable loss function
    /// \param[in] C regularization parameter - always the 'true' value of C, even when unconstrained is set
    /// \param[in] offset whether to train with an offset/bias parameter or not
    /// \param[in] unconstrained when a C-value is given via setParameterVector, should it be piped through the exp-function before using it in the solver?
    /// \param[in] budgetSize size of the budget/model that the final solution will have. Note that it might be smaller though.
    /// \param[in] budgetMaintenanceStrategy object that contains the logic for maintaining the budget size.
    /// \param[in] epochs number of epochs the SGD solver should run. If zero is given, the number of iterations will be max(10*datasetsize, C*datasetsize).
    /// \param[in] preInitializationMethod the method to preinitialize the budget.
    /// \param[in] minMargin the margin every vector has to obey. Usually this is 1.
    ///
    KernelBudgetedSGDTrainer(
        KernelType* kernel,
        const LossType* loss,
        double C,
        bool offset,
        bool unconstrained = false,
        size_t budgetSize = 500,
        AbstractBudgetMaintenanceStrategy<InputType>* budgetMaintenanceStrategy = NULL,
        size_t epochs = 1,
        size_t preInitializationMethod = NONE,
        double minMargin = 1.0f
    ): m_kernel(kernel)
        , m_loss(loss)
        , m_C(C)
        , m_offset(offset)
        , m_unconstrained(unconstrained)
        , m_budgetSize(budgetSize)
        , m_budgetMaintenanceStrategy(budgetMaintenanceStrategy)
        , m_epochs(epochs)
        , m_preInitializationMethod(preInitializationMethod)
        , m_minMargin(minMargin)
    {

        // check that the maintenance strategy is not null.
        SHARK_RUNTIME_CHECK(m_budgetMaintenanceStrategy, "Budget maintenance strategy must not be NULL!");
        SHARK_RUNTIME_CHECK(m_kernel, "Kernel must not be NULL!");
        SHARK_RUNTIME_CHECK(m_loss, "Loss must not be NULL!");
    }


    /// get budget size
    /// \return budget size
    ///
    size_t budgetSize() const
    {
        return m_budgetSize;
    }


    /// set budget size
    /// \param[in] budgetSize size of budget.
    ///
    void setBudgetSize(std::size_t budgetSize)
    {
        m_budgetSize = budgetSize;
    }


    /// return pointer to the budget maintenance strategy
    /// \return pointer to the budget maintenance strategy.
    ///
    AbstractBudgetMaintenanceStrategy<InputType>* budgetMaintenanceStrategy() const
    {
        return m_budgetMaintenanceStrategy;
    }


    /// set budget maintenance strategy
    /// \param[in] budgetMaintenanceStrategy set strategy to given object.
    ///
    void setBudgetMaintenanceStrategy(AbstractBudgetMaintenanceStrategy<InputType>* budgetMaintenanceStrategy)
    {
        m_budgetMaintenanceStrategy = budgetMaintenanceStrategy;
    }


    /// return min margin
    /// \return current min margin
    ///
    double minMargin() const
    {
        return m_minMargin;
    }


    /// set min margin
    /// \param[in] minMargin new min margin.
    ///
    void setMinMargin(double minMargin)
    {
        m_minMargin = minMargin;
    }


    /// \brief From INameable: return the class name.
    std::string name() const
    {
        return "KernelBudgetedSGDTrainer";
    }


    /// Train routine.
    /// \param[in] classifier classifier object for the final solution.
    /// \param[in] dataset dataset to work with.
    ///
    void train(ClassifierType &classifier, const LabeledData<InputType, unsigned int> &dataset)
    {

        std::size_t ell = dataset.numberOfElements();
        std::size_t classes = numberOfClasses(dataset);

        // is the budget size larger than reasonable?
        if(m_budgetSize > ell)
        {
            // in this case we just set the budgetSize to the given dataset size, so basically
            // there is an infinite budget.
            m_budgetSize = ell;
        }

        // we always need one budget vector more than the user specified,
        // as we first have to add any new vector to the budget before applying
        // the maintenance strategy. an alternative would be to keep the budget size
        // correct and test explicitly for the new support vector, but that would
        // create even more hassle on the other side. or one could use a vector of
        // budget vectors instead, but lose the nice framework of kernel expansions.
        // so the last budget vector must always have zero alpha coefficients in
        // the final model. (we do not check for that but roughly assume that in
        // the strategies, e.g. by putting the new vector to the last position in the
        // merge strategy).
        m_budgetSize = m_budgetSize + 1;

        // easy access
        UIntVector y = createBatch(dataset.labels().elements());

        // create a preinitialized budget.
        // this is used to initialize the kernel expansion we will work with.
        LabeledData<InputType, unsigned int> preinitializedBudgetVectors(m_budgetSize, dataset.element(0));

        // preinit the vectors first.
        // we still preinit even for no preinit, as we need the vectors in the
        // constructor of the kernel expansion. the alphas will be set to zero for none.
        if((m_preInitializationMethod == RANDOM) || (m_preInitializationMethod == NONE))
        {
            for(size_t j = 0; j < m_budgetSize; j++)
            {
                // choose a random vector
                std::size_t b = random::discrete(random::globalRng, std::size_t(0), ell - 1);

                // copy over the vector
                preinitializedBudgetVectors.element(j) = dataset.element(b);
            }
        }

        /*
        // TODO: kmeans initialization
        if (m_preInitializationMethod == KMEANS) {
            // the negative examples individually. the number of clusters should
            // then follow the ratio of the classes. then we can set the alphas easily.
            // TODO: do this multiclass
            // TODO: maybe Kmedoid makes more sense because of the alphas.
            // TODO: allow for different maxiters
            Centroids centroids;
            size_t maxIterations = 50;
            kMeans (dataset.inputs(), m_budgetSize, centroids, maxIterations);

            // copy over to our budget
            Data<RealVector> const& c = centroids.centroids();

            for (size_t j = 0; j < m_budgetSize; j++) {
                preinitializedBudgetVectors.inputs().element (j) = c.element (j);
                preinitializedBudgetVectors.labels().element (j) = 1; //FIXME
            }
        }
        */

        // budget is a kernel expansion in its own right
        ModelType &budgetModel = classifier.decisionFunction();
        RealMatrix &budgetAlpha = budgetModel.alpha();
        budgetModel.setStructure(m_kernel, preinitializedBudgetVectors.inputs(), m_offset, classes);


        // variables
        const double lambda = 1.0 / (ell * m_C);
        std::size_t iterations;


        // set epoch number
        if(m_epochs == 0)
            iterations = std::max(10 * ell, std::size_t (std::ceil(m_C * ell)));
        else
            iterations = m_epochs * ell;

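        // (worked example: with ell = 1000 and C = 1 the code above gives
        // lambda = 1/(1000*1) = 0.001, and for m_epochs == 0 the SGD loop
        // below runs max(10*1000, ceil(1*1000)) = 10000 iterations.)
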

        // set the initial alphas (we do this here, after the array has been initialized by setStructure)
        if(m_preInitializationMethod == RANDOM)
        {
            for(size_t j = 0; j < m_budgetSize; j++)
            {
                size_t c = preinitializedBudgetVectors.labels().element(j);
                budgetAlpha(j, c) = 1 / (1 + lambda);
                budgetAlpha(j, (c + 1) % classes) = -1 / (1 + lambda);
            }
        }


        // whatever strategy we used, the last budget vector needs
        // to be zeroed out: either it was zero anyway (no preinit)
        // or it is the extra budget vector we need for technical reasons.
        row(budgetAlpha, m_budgetSize - 1) *= 0;


        // preinitialize everything to prevent costly memory allocations in the loop
        RealVector predictions(classes, 0.0);
        RealVector derivative(classes, 0.0);


        // SGD loop
        std::size_t b = 0;

        for(std::size_t iter = 0; iter < iterations; iter++)
        {
            // active variable
            b = random::discrete(random::globalRng, std::size_t(0), ell - 1);

            // for smaller datasets, permuting the dataset instead of choosing
            // samples at random can be a valid strategy. We do not implement
            // that here.

            // compute prediction within the budgeted model.
            // this will compute the predictions for all classes in one step
            budgetModel.eval(dataset.inputs().element(b), predictions);

            // now we follow the Crammer-Singer model as written
            // in the paper (p. 11 top): we compute the scores of the true
            // class and the runner-up class. for the latter we remove
            // our true prediction temporarily and redo the argmax.

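            // (in formulas, with step counter t = iter + 1: the whole model is
            // scaled by (1 - 1/t) and, on a margin violation, the current sample
            // is added with coefficients beta_t = +1/(lambda*t) for the true
            // class and -1/(lambda*t) for the runner-up class; the scaling and
            // the if-branch below implement exactly these two steps.)
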
            RealVector predictionsCopy = predictions;
            unsigned int trueClass = y[b];
            double scoreOfTrueClass = predictions[trueClass];
            predictions[trueClass] = -std::numeric_limits<double>::infinity();
            unsigned int runnerupClass = (unsigned int)arg_max(predictions);
            double scoreOfRunnerupClass = predictions[runnerupClass];

            SHARK_ASSERT(trueClass != runnerupClass);

            // scale alphas
            budgetModel.alpha() *= ((long double)(1.0 - 1.0 / (iter + 1.0)));

            // check if there is a margin violation
            if(scoreOfTrueClass - scoreOfRunnerupClass < m_minMargin)
            {
                // TODO: check if the current vector is already part of our budget

                // as we do not use the predictions anymore, we use them to push the new alpha values
                // to the budgeted model
                predictions.clear();

                // set the alpha values (see p. 11, beta_t^{(i)} formula in Wang, Crammer, Vucetic)
                // alpha of true class
                predictions[trueClass] = 1.0 / ((long double)(iter + 1.0) * lambda);

                // alpha of runnerup class
                predictions[runnerupClass] = -1.0 / ((long double)(iter + 1.0) * lambda);

                m_budgetMaintenanceStrategy->addToModel(budgetModel, predictions, dataset.element(b));
            }
        }

        // finally we need to get rid of zero support vectors.
        budgetModel.sparsify();

    }

    /// Return the number of training epochs.
    /// A value of 0 indicates that the default of max(10, C) should be used.
    std::size_t epochs() const
    {
        return m_epochs;
    }

    /// Set the number of training epochs.
    /// A value of 0 indicates that the default of max(10, C) should be used.
    void setEpochs(std::size_t value)
    {
        m_epochs = value;
    }

    /// get the kernel function
    KernelType *kernel()
    {
        return m_kernel;
    }
    /// get the kernel function
    const KernelType *kernel() const
    {
        return m_kernel;
    }
    /// set the kernel function
    void setKernel(KernelType *kernel)
    {
        m_kernel = kernel;
    }

    /// check whether the parameter C is represented as log(C), thus,
    /// in a form suitable for unconstrained optimization, in the
    /// parameter vector
    bool isUnconstrained() const
    {
        return m_unconstrained;
    }

    /// return the value of the regularization parameter
    double C() const
    {
        return m_C;
    }

    /// set the value of the regularization parameter (must be positive)
    void setC(double value)
    {
        RANGE_CHECK(value > 0.0);
        m_C = value;
    }

    /// check whether the model to be trained should include an offset term
    bool trainOffset() const
    {
        return m_offset;
    }

    ///\brief Returns the vector of hyper-parameters.
    RealVector parameterVector() const
    {
        if(m_unconstrained)
            return m_kernel->parameterVector() | log(m_C);
        else
            return m_kernel->parameterVector() | m_C;
    }

    ///\brief Sets the vector of hyper-parameters.
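    /// (For example, with a kernel exposing a single parameter gamma and
    /// m_unconstrained == true, newParameters holds (gamma, log(C)) and the
    /// stored C is recovered below via exp.)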
    void setParameterVector(RealVector const &newParameters)
    {
        size_t kp = m_kernel->numberOfParameters();
        SHARK_ASSERT(newParameters.size() == kp + 1);
        m_kernel->setParameterVector(subrange(newParameters, 0, kp));
        m_C = newParameters.back();
        if(m_unconstrained) m_C = exp(m_C);
    }

    ///\brief Returns the number of hyper-parameters.
    size_t numberOfParameters() const
    {
        return m_kernel->numberOfParameters() + 1;
    }

protected:
    KernelType *m_kernel; ///< pointer to kernel function
    const LossType *m_loss; ///< pointer to loss function
    double m_C; ///< regularization parameter
    bool m_offset; ///< should the resulting model have an offset term?
    bool m_unconstrained; ///< should C be stored as log(C) as a parameter?

    /// budget size
    std::size_t m_budgetSize;

    /// budget maintenance strategy
    AbstractBudgetMaintenanceStrategy<InputType>* m_budgetMaintenanceStrategy;

    std::size_t m_epochs; ///< number of training epochs (sweeps over the data), or 0 for default = max(10, C)

    /// method to preinitialize the budget
    std::size_t m_preInitializationMethod;

    /// needed margin below which we update the model, also called beta sometimes
    double m_minMargin;
};

}
#endif