Pegasos.h
//===========================================================================
/*!
 *
 * \brief       Pegasos solvers for linear SVMs
 *
 * \author      T. Glasmachers
 * \date        2012
 *
 * \par Copyright 1995-2017 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <http://shark-ml.org/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================

#ifndef SHARK_ALGORITHMS_PEGASOS_H
#define SHARK_ALGORITHMS_PEGASOS_H

#include <shark/LinAlg/Base.h>
#include <shark/Data/Dataset.h>
#include <shark/Core/Random.h>
#include <cmath>
#include <iostream>


namespace shark {


///
/// \brief Pegasos solver for linear (binary) support vector machines.
///
template <class VectorType>
class Pegasos
{
public:
    /// \brief Solve the primal SVM problem.
    ///
    /// In addition to "standard" Pegasos this solver checks a
    /// meaningful stopping criterion.
    ///
    /// The function returns the number of model predictions
    /// during training (this is comparable to SMO iterations).
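    ///
    /// A minimal usage sketch: the dataset below is assumed to be loaded
    /// elsewhere with binary labels in {0, 1}, and C = 1.0 is only an
    /// illustrative choice of the regularization parameter.
    /// \code
    /// LabeledData<RealVector, unsigned int> trainingData;    // assumed: binary training data
    /// RealVector w(inputDimension(trainingData));             // weight vector; solve() zeroes it
    /// std::size_t predictions =
    ///     Pegasos<RealVector>::solve(trainingData, 1.0, w);   // data, C, w (default batch size and accuracy)
    /// \endcode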
    template <class WeightType>
    static std::size_t solve(
            LabeledData<VectorType, unsigned int> const& data,  ///< training data
            double C,                                            ///< SVM regularization parameter
            WeightType& w,                                       ///< weight vector
            std::size_t batchsize = 1,                           ///< number of samples in each gradient estimate
            double varepsilon = 0.001)                           ///< solution accuracy (factor by which the primal gradient should be reduced)
    {
        std::size_t ell = data.numberOfElements();
        double lambda = 1.0 / (ell * C);
        SHARK_ASSERT(batchsize > 0);

        double initialPrimal = 1.0;
        double normbound2 = initialPrimal / lambda;   // upper bound for |sigma * w|^2
        double norm_w2 = 0.0;                         // squared norm of w
        double sigma = 1.0;                           // scaling factor for w
        VectorType gradient(w.size());                // gradient (to be computed in each iteration)
        w = RealZeroVector(w.size());                 // clear does not work on matrix rows (ublas sucks!)
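
        // Note: the weight vector is maintained in factored form; the actual
        // iterate is sigma * w. The (1 - 1/t) shrinkage and the projection onto
        // the ball of radius sqrt(initialPrimal / lambda) below only rescale the
        // scalar sigma, so these steps stay cheap regardless of the dimension.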

        // pegasos main loop
        std::size_t start = 10;
        std::size_t checkinterval = (2 * ell) / batchsize;
        std::size_t nextcheck = start + ell / batchsize;
        std::size_t predictions = 0;
        for (std::size_t t=start; ; t++)
        {
            // check the stopping criterion: ||gradient|| < epsilon ?
            if (t >= nextcheck)
            {
                // compute the gradient
                gradient = (lambda * sigma * (double)ell) * w;
                for (std::size_t i=0; i<ell; i++)
                {
                    VectorType const& x = data(i).input;
                    unsigned int y = data(i).label;
                    double f = sigma * inner_prod(w, x);
                    lg(x, y, f, gradient);
                }
                predictions += ell;

                // compute the norm of the gradient
                double n2 = inner_prod(gradient, gradient);
                double n = std::sqrt(n2) / (double)ell;

                // check the stopping criterion
                if (n < varepsilon)
                {
//                  std::cout << "target accuracy reached." << std::endl;
//                  std::cout << "accuracy: " << n << std::endl;
//                  std::cout << "predictions: " << predictions << std::endl;
                    break;
                }

                nextcheck = t + checkinterval;
            }

            // compute the gradient
            gradient.clear();
            bool nonzero = false;
            for (unsigned int i=0; i<batchsize; i++)
            {
                // select the active variable (sample with replacement)
                std::size_t active = random::discrete(0, ell-1);
                VectorType const& x = data(active).input;
                unsigned int y = data(active).label;
                SHARK_ASSERT(y < 2);

                // compute the prediction
                double f = sigma * inner_prod(w, x);
                predictions++;

                // compute the loss gradient; lg returns true iff the gradient is non-zero
                if (lg(x, y, f, gradient)) nonzero = true;
            }

            // update: the actual iterate is v = sigma * w, and the Pegasos step
            //   v  <-  (1 - 1/t) * v  -  (1 / (lambda * t * batchsize)) * (sum of loss gradients)
            // is realized by shrinking sigma and stepping in the stored vector w,
            // which is why eta carries an extra factor of 1 / sigma.
            sigma *= (1.0 - 1.0 / (double)t);
            if (nonzero)
            {
                double eta = 1.0 / (sigma * lambda * t * batchsize);
                gradient *= eta;
                // incremental update of |w|^2:  |w - g|^2 = |w|^2 + |g|^2 - 2 <w, g>
                norm_w2 += inner_prod(gradient, gradient) - 2.0 * inner_prod(w, gradient);
                noalias(w) -= gradient;

                // project to the ball
                double n2 = sigma * sigma * norm_w2;
                if (n2 > normbound2) sigma *= std::sqrt(normbound2 / n2);
            }
        }

        // rescale the solution
        w *= sigma;
        return predictions;
    }

protected:
    // Gradient of the hinge loss for a single example. The binary label
    // y in {0, 1} is interpreted as target -1 (y == 0) or +1 (y == 1).
    // Returns true iff the example violates the margin, i.e. the gradient
    // contribution is non-zero.
    static bool lg(
            VectorType const& x,
            unsigned int y,
            double f,
            VectorType& gradient)
    {
        if (y == 0)
        {
            if (f > -1.0)
            {
                gradient += x;
                return true;
            }
        }
        else if (y == 1)
        {
            if (f < 1.0)
            {
                gradient -= x;
                return true;
            }
        }
        return false;
    }
};


///
/// \brief Pegasos solver for linear multi-class support vector machines.
///
template <class VectorType>
class McPegasos
{
public:
    /// \brief Multi-class margin type.
    enum eMarginType
    {
        emRelative,
        emAbsolute,
    };

    /// \brief Multi-class loss function type.
    enum eLossType
    {
        elNaiveHinge,
        elDiscriminativeMax,
        elDiscriminativeSum,
        elTotalMax,
        elTotalSum,
    };
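
    // Supported combinations of margin and loss type (see the dispatch in
    // solve() below); all other combinations are rejected at runtime:
    //   emRelative + elDiscriminativeMax / elTotalMax   ->  CS-type machine
    //   emRelative + elDiscriminativeSum / elTotalSum   ->  WW-type machine
    //   emAbsolute + elNaiveHinge                       ->  MMR-type machine
    //   emAbsolute + elDiscriminativeMax                ->  ADM machine
    //   emAbsolute + elDiscriminativeSum                ->  LLW-type machine
    //   emAbsolute + elTotalMax                         ->  ATM machine
    //   emAbsolute + elTotalSum                         ->  ATS/OVA machine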

    /// \brief Solve the primal multi-class SVM problem.
    ///
    /// In addition to "standard" Pegasos this solver checks a
    /// meaningful stopping criterion.
    ///
    /// The function returns the number of model predictions
    /// during training (this is comparable to SMO iterations).
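    ///
    /// A minimal usage sketch; the dataset is assumed to be loaded elsewhere
    /// with labels in {0, ..., classes-1}, and both the chosen margin/loss pair
    /// (relative margin with sum loss, i.e. the WW-type machine) and C = 1.0
    /// are only illustrative.
    /// \code
    /// LabeledData<RealVector, unsigned int> trainingData;   // assumed: multi-class training data
    /// std::size_t classes = numberOfClasses(trainingData);
    /// std::vector<RealVector> w(classes, RealVector(inputDimension(trainingData)));
    /// std::size_t predictions = McPegasos<RealVector>::solve(
    ///     trainingData,
    ///     McPegasos<RealVector>::emRelative,
    ///     McPegasos<RealVector>::elDiscriminativeSum,
    ///     true,        // enforce the sum-to-zero constraint
    ///     1.0,         // C
    ///     w);
    /// \endcode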
    template <class WeightType>
    static std::size_t solve(
            LabeledData<VectorType, unsigned int> const& data,  ///< training data
            eMarginType margintype,                              ///< margin function type
            eLossType losstype,                                  ///< loss function type
            bool sumToZero,                                      ///< enforce the sum-to-zero constraint?
            double C,                                            ///< SVM regularization parameter
            std::vector<WeightType>& w,                          ///< class-wise weight vectors
            std::size_t batchsize = 1,                           ///< number of samples in each gradient estimate
            double varepsilon = 0.001)                           ///< solution accuracy (factor by which the primal gradient should be reduced)
    {
        SHARK_ASSERT(batchsize > 0);
        std::size_t ell = data.numberOfElements();
        unsigned int classes = w.size();
        SHARK_ASSERT(classes >= 2);
        double lambda = 1.0 / (ell * C);

        double initialPrimal = -1.0;
        LossGradientFunction lg = NULL;
        if (margintype == emRelative)
        {
            if (losstype == elDiscriminativeMax || losstype == elTotalMax)
            {
                // CS (Crammer & Singer) case
                initialPrimal = 1.0;
                lg = lossGradientRDM;
            }
            else if (losstype == elDiscriminativeSum || losstype == elTotalSum)
            {
                // WW (Weston & Watkins) case
                initialPrimal = classes - 1.0;
                lg = lossGradientRDS;
            }
        }
        else if (margintype == emAbsolute)
        {
            if (losstype == elNaiveHinge)
            {
                // MMR case
                initialPrimal = 1.0;
                lg = lossGradientANH;
            }
            else if (losstype == elDiscriminativeMax)
            {
                // ADM case
                initialPrimal = 1.0;
                lg = lossGradientADM;
            }
            else if (losstype == elDiscriminativeSum)
            {
                // LLW (Lee, Lin & Wahba) case
                initialPrimal = classes - 1.0;
                lg = lossGradientADS;
            }
            else if (losstype == elTotalMax)
            {
                // ATM case
                initialPrimal = 1.0;
                lg = lossGradientATM;
            }
            else if (losstype == elTotalSum)
            {
                // ATS/OVA (one-versus-all) case
                initialPrimal = classes;
                lg = lossGradientATS;
            }
        }
        SHARK_RUNTIME_CHECK(initialPrimal > 0 && lg, "The combination of margin and loss is not implemented");

        double normbound2 = initialPrimal / lambda;   // upper bound for |sigma * w|^2
        double norm_w2 = 0.0;                         // squared norm of w
        double sigma = 1.0;                           // scaling factor for w
        double target = initialPrimal * varepsilon;   // target gradient norm
        std::vector<VectorType> gradient(classes);    // gradient (to be computed in each iteration)
        RealVector f(classes);                        // machine prediction (computed for each example)
        for (unsigned int c=0; c<classes; c++)
        {
            gradient[c].resize(w[c].size());
            w[c] = RealZeroVector(w[c].size());
        }

        // pegasos main loop
        std::size_t start = 10;
        std::size_t checkinterval = (2 * ell) / batchsize;
        std::size_t nextcheck = start + ell / batchsize;
        std::size_t predictions = 0;
        for (std::size_t t=start; ; t++)
        {
            // check the stopping criterion: ||gradient|| < epsilon ?
            if (t >= nextcheck)
            {
                // compute the gradient
                for (unsigned int c=0; c<classes; c++) gradient[c] = (lambda * sigma * (double)ell) * w[c];
                for (std::size_t i=0; i<ell; i++)
                {
                    VectorType const& x = data(i).input;
                    unsigned int y = data(i).label;
                    for (unsigned int c=0; c<classes; c++) f(c) = sigma * inner_prod(w[c], x);
                    lg(x, y, f, gradient, sumToZero);
                }
                predictions += ell;

                // compute the norm of the gradient
                double n2 = 0.0;
                for (unsigned int c=0; c<classes; c++) n2 += inner_prod(gradient[c], gradient[c]);
                double n = std::sqrt(n2) / (double)ell;

                // check the stopping criterion
                if (n < target)
                {
//                  std::cout << "target accuracy reached." << std::endl;
//                  std::cout << "accuracy: " << n << std::endl;
//                  std::cout << "predictions: " << predictions << std::endl;
                    break;
                }

                nextcheck = t + checkinterval;
            }

            // compute the gradient
            for (unsigned int c=0; c<classes; c++) gradient[c].clear();
            bool nonzero = false;
            for (unsigned int i=0; i<batchsize; i++)
            {
                // select the active variable (sample with replacement)
                std::size_t active = random::discrete(0, ell-1);
                VectorType const& x = data(active).input;
                unsigned int y = data(active).label;
                SHARK_ASSERT(y < classes);

                // compute the prediction
                for (unsigned int c=0; c<classes; c++) f(c) = sigma * inner_prod(w[c], x);
                predictions++;

                // compute the loss gradient; lg returns true iff the gradient is non-zero
                if (lg(x, y, f, gradient, sumToZero)) nonzero = true;
            }

            // update
            sigma *= (1.0 - 1.0 / (double)t);
            if (nonzero)
            {
                double eta = 1.0 / (sigma * lambda * t * batchsize);
                for (unsigned int c=0; c<classes; c++)
                {
                    gradient[c] *= eta;
                    norm_w2 += inner_prod(gradient[c], gradient[c]) - 2.0 * inner_prod(w[c], gradient[c]);
                    noalias(w[c]) -= gradient[c];
                }

                // project to the ball
                double n2 = sigma * sigma * norm_w2;
                if (n2 > normbound2) sigma *= std::sqrt(normbound2 / n2);
            }
        }

        // rescale the solution
        for (unsigned int c=0; c<classes; c++) w[c] *= sigma;
        return predictions;
    }

protected:
    // Function type for the computation of the gradient
    // of the loss. A return value of true indicates that
    // the gradient is non-zero.
    typedef bool(*LossGradientFunction)(VectorType const&, unsigned int, RealVector const&, std::vector<VectorType>&, bool);

    // absolute margin, naive hinge loss (MMR case)
    static bool lossGradientANH(
            VectorType const& x,
            unsigned int y,
            RealVector const& f,
            std::vector<VectorType>& gradient,
            bool sumToZero)
    {
        if (f(y) < 1.0)
        {
            gradient[y] -= x;
            if (sumToZero)
            {
                VectorType xx = (1.0 / (f.size() - 1.0)) * x;
                for (std::size_t c=0; c<f.size(); c++) if (c != y) gradient[c] += xx;
            }
            return true;
        }
        else return false;
    }

    // relative margin, max loss (CS case)
    static bool lossGradientRDM(
            VectorType const& x,
            unsigned int y,
            RealVector const& f,
            std::vector<VectorType>& gradient,
            bool sumToZero)
    {
        unsigned int argmax = 0;
        double max = -1e100;
        for (std::size_t c=0; c<f.size(); c++)
        {
            if (c != y && f(c) > max)
            {
                max = f(c);
                argmax = c;
            }
        }
        if (f(y) < 1.0 + max)
        {
            gradient[y] -= x;
            gradient[argmax] += x;
            return true;
        }
        else return false;
    }

    // relative margin, sum loss (WW case)
    static bool lossGradientRDS(
            VectorType const& x,
            unsigned int y,
            RealVector const& f,
            std::vector<VectorType>& gradient,
            bool sumToZero)
    {
        bool nonzero = false;
        for (std::size_t c=0; c<f.size(); c++)
        {
            if (c != y && f(y) < 1.0 + f(c))
            {
                gradient[y] -= x;
                gradient[c] += x;
                nonzero = true;
            }
        }
        return nonzero;
    }

    // absolute margin, discriminative sum loss (LLW case)
    static bool lossGradientADS(
            VectorType const& x,
            unsigned int y,
            RealVector const& f,
            std::vector<VectorType>& gradient,
            bool sumToZero)
    {
        bool nonzero = false;
        for (std::size_t c=0; c<f.size(); c++)
        {
            if (c != y && f(c) > -1.0)
            {
                gradient[c] += x;
                nonzero = true;
            }
        }
        if (sumToZero && nonzero)
        {
            VectorType mean = gradient[0];
            for (std::size_t c=1; c<f.size(); c++) mean += gradient[c];
            mean /= f.size();
            for (std::size_t c=0; c<f.size(); c++) gradient[c] -= mean;
        }
        return nonzero;
    }

    // absolute margin, discriminative max loss (ADM case)
    static bool lossGradientADM(
            VectorType const& x,
            unsigned int y,
            RealVector const& f,
            std::vector<VectorType>& gradient,
            bool sumToZero)
    {
        double max = -1e100;
        std::size_t argmax = 0;
        for (std::size_t c=0; c<f.size(); c++)
        {
            if (c == y) continue;
            if (f(c) > max)
            {
                max = f(c);
                argmax = c;
            }
        }
        if (max > -1.0)
        {
            gradient[argmax] += x;
            if (sumToZero)
            {
                VectorType xx = (1.0 / (f.size() - 1.0)) * x;
                for (std::size_t c=0; c<f.size(); c++) if (c != argmax) gradient[c] -= xx;
            }
            return true;
        }
        else return false;
    }

    // absolute margin, total sum loss (ATS/OVA case)
    static bool lossGradientATS(
            VectorType const& x,
            unsigned int y,
            RealVector const& f,
            std::vector<VectorType>& gradient,
            bool sumToZero)
    {
        bool nonzero = false;
        for (std::size_t c=0; c<f.size(); c++)
        {
            if (c == y)
            {
                if (f(c) < 1.0)
                {
                    gradient[c] -= x;
                    nonzero = true;
                }
            }
            else
            {
                if (f(c) > -1.0)
                {
                    gradient[c] += x;
                    nonzero = true;
                }
            }
        }
        if (sumToZero && nonzero)
        {
            VectorType mean = gradient[0];
            for (std::size_t c=1; c<f.size(); c++) mean += gradient[c];
            mean /= f.size();
            for (std::size_t c=0; c<f.size(); c++) gradient[c] -= mean;
        }
        return nonzero;
    }

    // absolute margin, total max loss (ATM case)
    static bool lossGradientATM(
            VectorType const& x,
            unsigned int y,
            RealVector const& f,
            std::vector<VectorType>& gradient,
            bool sumToZero)
    {
        double max = -1e100;
        std::size_t argmax = 0;
        for (std::size_t c=0; c<f.size(); c++)
        {
            if (c == y)
            {
                if (-f(c) > max)
                {
                    max = -f(c);
                    argmax = c;
                }
            }
            else
            {
                if (f(c) > max)
                {
                    max = f(c);
                    argmax = c;
                }
            }
        }
        if (max > -1.0)
        {
            if (argmax == y)
            {
                gradient[argmax] -= x;
                if (sumToZero)
                {
                    VectorType xx = (1.0 / (f.size() - 1.0)) * x;
                    for (std::size_t c=0; c<f.size(); c++) if (c != argmax) gradient[c] += xx;
                }
            }
            else
            {
                gradient[argmax] += x;
                if (sumToZero)
                {
                    VectorType xx = (1.0 / (f.size() - 1.0)) * x;
                    for (std::size_t c=0; c<f.size(); c++) if (c != argmax) gradient[c] -= xx;
                }
            }
            return true;
        }
        else return false;
    }
};


}
#endif