QpMcLinear.h
//===========================================================================
/*!
 *
 *
 * \brief Quadratic programming solvers for linear multi-class SVM training without bias.
 *
 *
 *
 * \author T. Glasmachers
 * \date -
 *
 *
 * \par Copyright 1995-2017 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <http://shark-ml.org/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================


#ifndef SHARK_ALGORITHMS_QP_QPMCLINEAR_H
#define SHARK_ALGORITHMS_QP_QPMCLINEAR_H

#include <shark/Core/Timer.h>
#include <shark/Algorithms/QP/QuadraticProgram.h> // QpStoppingCondition, QpSolutionProperties
#include <shark/Data/Dataset.h>
#include <shark/Data/DataView.h>
#include <shark/LinAlg/Base.h>
#include <algorithm> // std::shuffle
#include <cmath>
#include <iostream>
#include <vector>


namespace shark {


/// \brief Generic solver skeleton for linear multi-class SVM problems.
template <class InputT>
class QpMcLinear
{
public:
    typedef LabeledData<InputT, unsigned int> DatasetType;
    typedef typename Batch<InputT>::const_reference InputReferenceType;

    enum {UNIFORM, ACF}; // coordinate selection strategies


    ///
    /// \brief Constructor
    ///
    /// \param dataset training data
    /// \param dim problem dimension
    /// \param classes number of classes in the problem
    /// \param strategy coordinate selection strategy
    /// \param shrinking flag turning shrinking on and off
    ///
    QpMcLinear(
            const DatasetType& dataset,
            std::size_t dim,
            std::size_t classes,
            std::size_t strategy = ACF,
            bool shrinking = false)
    : m_data(dataset)
    , m_xSquared(dataset.numberOfElements())
    , m_dim(dim)
    , m_classes(classes)
    , m_strategy(strategy)
    , m_shrinking(shrinking)
    {
        SHARK_ASSERT(m_dim > 0);

        // precompute the squared norm of every training example
        for (std::size_t i=0; i<m_data.size(); i++)
        {
            m_xSquared(i) = inner_prod(m_data[i].input, m_data[i].input);
        }
    }

    ///
    /// \brief Solve the SVM training problem.
    ///
    /// \param rng random number generator used by the algorithm
    /// \param C regularization constant of the SVM
    /// \param stop stopping condition(s)
    /// \param prop solution properties
    /// \param verbose if true, the solver prints status information and solution statistics
    ///
    RealMatrix solve(
            random::rng_type& rng,
            double C,
            QpStoppingCondition& stop,
            QpSolutionProperties* prop = NULL,
            bool verbose = false)
    {
        // sanity checks
        SHARK_ASSERT(C > 0.0);

        // measure training time
        Timer timer;
        timer.start();

        // prepare dimensions and vectors
        std::size_t ell = m_data.size();           // number of training examples
        RealMatrix alpha(ell, m_classes + 1, 0.0); // Lagrange multipliers (dual variables); one extra column is reserved for the sum constraint used by the CS, ADM, and ATM solvers
        RealMatrix w(m_classes, m_dim, 0.0);       // weight vectors (primal variables)

        // scheduling of steps, for ACF only
        RealVector pref(ell, 1.0);    // example-wise measure of success
        double prefsum = (double)ell; // normalization constant

        std::vector<std::size_t> schedule(ell);
        if (m_strategy == UNIFORM)
        {
            for (std::size_t i=0; i<ell; i++) schedule[i] = i;
        }

        // used for shrinking
        std::size_t active = ell;

        // prepare counters
        std::size_t epoch = 0;
        std::size_t steps = 0;

        // prepare performance monitoring
        double objective = 0.0;
        double max_violation = 0.0;

        // gain for ACF
        const double gain_learning_rate = 1.0 / ell;
        double average_gain = 0.0;


        // outer optimization loop (epochs)
        bool canstop = true;
        while (true)
        {
            if (m_strategy == ACF)
            {
                // define the schedule: each example receives a number of
                // slots proportional to its preference
                double psum = prefsum;
                prefsum = 0.0;
                std::size_t pos = 0;
                for (std::size_t i=0; i<ell; i++)
                {
                    double p = pref(i);
                    double num = (psum < 1e-6) ? ell - pos : std::min((double)(ell - pos), (ell - pos) * p / psum);
                    std::size_t n = (std::size_t)std::floor(num);
                    double prob = num - n;
                    if (random::uni(rng) < prob) n++;
                    for (std::size_t j=0; j<n; j++)
                    {
                        schedule[pos] = i;
                        pos++;
                    }
                    psum -= p;
                    prefsum += p;
                }
                SHARK_ASSERT(pos == ell);
            }

            if (m_shrinking == true)
            {
                std::shuffle(schedule.begin(), schedule.begin() + active, rng);
            }
            else
            {
                std::shuffle(schedule.begin(), schedule.end(), rng);
            }

            // inner loop (one epoch)
            max_violation = 0.0;
            std::size_t nPoints = ell;
            if (m_shrinking == true)
                nPoints = active;

            for (std::size_t j=0; j<nPoints; j++)
            {
                // active example
                double gain = 0.0;
                const std::size_t i = schedule[j];
                InputReferenceType x_i = m_data[i].input;
                const unsigned int y_i = m_data[i].label;
                const double q = m_xSquared(i);
                blas::matrix_row<RealMatrix> a = row(alpha, i);

                // compute gradient and KKT violation
                RealVector wx = prod(w, x_i);
                RealVector g(m_classes);
                double kkt = calcGradient(g, wx, a, C, y_i);

                if (kkt > 0.0)
                {
                    max_violation = std::max(max_violation, kkt);

                    // perform the step on alpha
                    RealVector mu(m_classes, 0.0);
                    gain = solveSub(0.1 * stop.minAccuracy, g, q, C, y_i, a, mu);
                    objective += gain;
                    steps++;

                    // update weight vectors
                    updateWeightVectors(w, mu, i);
                }
                else if (m_shrinking == true)
                {
                    active--;
                    std::swap(schedule[j], schedule[active]);
                    j--;
                }

                // update gain-based preferences
                if (m_strategy == ACF)
                {
                    if (epoch == 0) average_gain += gain / (double)ell;
                    else
                    {
                        // strategy constants
                        constexpr double CHANGE_RATE = 0.2;
                        constexpr double PREF_MIN = 0.05;
                        constexpr double PREF_MAX = 20.0;

                        double change = CHANGE_RATE * (gain / average_gain - 1.0);
                        double newpref = std::min(PREF_MAX, std::max(PREF_MIN, pref(i) * std::exp(change)));
                        prefsum += newpref - pref(i);
                        pref(i) = newpref;
                        average_gain = (1.0 - gain_learning_rate) * average_gain + gain_learning_rate * gain;
                    }
                }
            }

            epoch++;

            // stopping criteria
            if (stop.maxIterations > 0 && epoch * ell >= stop.maxIterations)
            {
                if (prop != NULL) prop->type = QpMaxIterationsReached;
                break;
            }

            if (timer.stop() >= stop.maxSeconds)
            {
                if (prop != NULL) prop->type = QpTimeout;
                break;
            }

            if (max_violation < stop.minAccuracy)
            {
                if (verbose)
                    std::cout << "#" << std::flush;
                if (canstop)
                {
                    if (prop != NULL) prop->type = QpAccuracyReached;
                    break;
                }
                else
                {
                    if (m_strategy == ACF)
                    {
                        // prepare a full sweep for a reliable check of the stopping criterion
                        canstop = true;
                        for (std::size_t i=0; i<ell; i++) pref(i) = 1.0;
                        prefsum = (double)ell;
                    }

                    if (m_shrinking == true)
                    {
                        // prepare a full sweep for a reliable check of the stopping criterion
                        active = ell;
                        canstop = true;
                    }
                }
            }
            else
            {
                if (verbose) std::cout << "." << std::flush;
                if (m_strategy == ACF)
                    canstop = false;
                if (m_shrinking == true)
                    canstop = (active == ell);
            }
        }
        timer.stop();

        // calculate the dual objective value
        objective = 0.0;
        for (std::size_t j=0; j<m_classes; j++)
        {
            for (std::size_t d=0; d<m_dim; d++) objective -= w(j, d) * w(j, d);
        }
        objective *= 0.5;
        for (std::size_t i=0; i<ell; i++)
        {
            for (std::size_t j=0; j<m_classes; j++) objective += alpha(i, j);
        }

        // return solution statistics
        if (prop != NULL)
        {
            prop->accuracy = max_violation; // this is approximate, but a good guess
            prop->iterations = ell * epoch;
            prop->value = objective;
            prop->seconds = timer.lastLap();
        }

        // output solution statistics
        if (verbose)
        {
            std::cout << std::endl;
            std::cout << "training time (seconds): " << timer.lastLap() << std::endl;
            std::cout << "number of epochs: " << epoch << std::endl;
            std::cout << "number of iterations: " << (ell * epoch) << std::endl;
            std::cout << "number of non-zero steps: " << steps << std::endl;
            std::cout << "dual accuracy: " << max_violation << std::endl;
            std::cout << "dual objective value: " << objective << std::endl;
        }

        // return the solution
        return w;
    }

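    // Illustrative sketch (not part of the original header): how the ACF
    // schedule above distributes one epoch's worth of steps. Suppose
    // ell = 4 examples with preferences pref = (2, 1, 0.5, 0.5), so
    // prefsum = 4. Example 0 receives min(4, 4 * 2/4) = 2 slots, example 1
    // receives 2 * 1/2 = 1 slot, example 2 receives a slot with
    // probability 0.5 (the random::uni() coin flip on the fractional
    // part), and the last example absorbs whatever remains, so exactly
    // ell slots are always filled (hence the assertion pos == ell).
    // Preferences then rise or fall with the gain of each step (see the
    // CHANGE_RATE update above), so successful examples are visited more
    // often in later epochs.
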
protected:
    // for all c: row(w, c) += mu(c) * x
    void add_scaled(RealMatrix& w, RealVector const& mu, InputReferenceType x)
    {
        for (std::size_t c=0; c<m_classes; c++) noalias(row(w, c)) += mu(c) * x;
    }

    /// \brief Compute the gradient from the inner products of the weight vectors with the current sample.
    ///
    /// \param gradient gradient vector to be filled in; the vector is correctly sized
    /// \param wx inner products of weight vectors with the current sample; wx(c) = <w_c, x>
    /// \param alpha variables corresponding to the current sample
    /// \param C upper bound on the variables
    /// \param y label of the current sample
    ///
    /// \return The violation of the KKT conditions.
    virtual double calcGradient(RealVector& gradient, RealVector wx, blas::matrix_row<RealMatrix> const& alpha, double C, unsigned int y) = 0;

    /// \brief Update the weight vectors (primal variables) after a step on the dual variables.
    ///
    /// \param w matrix of (dense) weight vectors (as rows)
    /// \param mu dual step on the variables corresponding to the current sample
    /// \param index current sample
    virtual void updateWeightVectors(RealMatrix& w, RealVector const& mu, std::size_t index) = 0;

    /// \brief Solve the sub-problem posed by a single training example.
    ///
    /// \param epsilon accuracy (dual gradient) up to which the sub-problem should be solved
    /// \param gradient gradient of the objective function w.r.t. alpha
    /// \param q squared norm of the current sample
    /// \param C upper bound on the variables
    /// \param y label of the current sample
    /// \param alpha input: initial point; output: (near) optimal point
    /// \param mu step from the initial point to the final point
    ///
    /// \return The gain of the step, i.e., the improvement of the objective function.
    virtual double solveSub(double epsilon, RealVector gradient, double q, double C, unsigned int y, blas::matrix_row<RealMatrix>& alpha, RealVector& mu) = 0;

    DataView<const DatasetType> m_data; ///< view on the training data
    RealVector m_xSquared;              ///< diagonal entries of the quadratic matrix
    std::size_t m_dim;                  ///< input space dimension
    std::size_t m_classes;              ///< number of classes
    std::size_t m_strategy;             ///< strategy for coordinate selection
    bool m_shrinking;                   ///< whether to apply shrinking
};

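// Illustrative usage sketch (not part of the original header). A minimal
// example of training one of the concrete solvers below; the dataset setup
// is schematic, and the helper calls inputDimension() / numberOfClasses()
// are assumptions about the surrounding Shark data API.
//
//     LabeledData<RealVector, unsigned int> data = ...; // training set
//     std::size_t dim = inputDimension(data);
//     std::size_t classes = numberOfClasses(data);
//     QpMcLinearWW<RealVector> solver(data, dim, classes);
//
//     random::rng_type rng(42);
//     QpStoppingCondition stop;
//     stop.minAccuracy = 0.001; // target KKT violation
//     QpSolutionProperties prop;
//     RealMatrix w = solver.solve(rng, 1.0 /*C*/, stop, &prop, true);
//     // row(w, c) is the weight vector of class c;
//     // the predicted label of x is argmax_c <w_c, x>
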
/// \brief Solver for the multi-class SVM by Weston & Watkins.
template <class InputT>
class QpMcLinearWW : public QpMcLinear<InputT>
{
public:
    typedef typename QpMcLinear<InputT>::DatasetType DatasetType;

    /// \brief Constructor
    QpMcLinearWW(
            const DatasetType& dataset,
            std::size_t dim,
            std::size_t classes)
    : QpMcLinear<InputT>(dataset, dim, classes)
    { }

protected:
    /// \brief Compute the gradient from the inner products of the weight vectors with the current sample.
    virtual double calcGradient(RealVector& gradient, RealVector wx, blas::matrix_row<RealMatrix> const& alpha, double C, unsigned int y)
    {
        double violation = 0.0;
        for (std::size_t c=0; c<wx.size(); c++)
        {
            if (c == y)
            {
                gradient(c) = 0.0;
            }
            else
            {
                const double g = 1.0 - 0.5 * (wx(y) - wx(c));
                gradient(c) = g;
                if (g > violation && alpha(c) < C) violation = g;
                else if (-g > violation && alpha(c) > 0.0) violation = -g;
            }
        }
        return violation;
    }

    /// \brief Update the weight vectors (primal variables) after a step on the dual variables.
    virtual void updateWeightVectors(RealMatrix& w, RealVector const& mu, std::size_t index)
    {
        double sum_mu = 0.0;
        for (std::size_t c=0; c<m_classes; c++) sum_mu += mu(c);
        unsigned int y = m_data[index].label;
        RealVector step(-0.5 * mu);
        step(y) = 0.5 * sum_mu;
        add_scaled(w, step, m_data[index].input);
    }

    /// \brief Solve the sub-problem posed by a single training example.
    virtual double solveSub(double epsilon, RealVector gradient, double q, double C, unsigned int y, blas::matrix_row<RealMatrix>& alpha, RealVector& mu)
    {
        const double qq = 0.5 * q;
        double gain = 0.0;

        // SMO loop
        std::size_t iter, maxiter = 10 * m_classes;
        for (iter=0; iter<maxiter; iter++)
        {
            // select working set
            std::size_t idx = 0;
            double kkt = 0.0;
            for (std::size_t c=0; c<m_classes; c++)
            {
                if (c == y) continue;

                const double g = gradient(c);
                const double a = alpha(c);
                if (g > kkt && a < C) { kkt = g; idx = c; }
                else if (-g > kkt && a > 0.0) { kkt = -g; idx = c; }
            }

            // check stopping criterion
            if (kkt < epsilon) break;

            // perform step
            const double a = alpha(idx);
            const double g = gradient(idx);
            double m = g / qq;
            double a_new = a + m;
            if (a_new <= 0.0)
            {
                m = -a;
                a_new = 0.0;
            }
            else if (a_new >= C)
            {
                m = C - a;
                a_new = C;
            }
            alpha(idx) = a_new;
            mu(idx) += m;

            // update gradient and total gain
            const double dg = 0.5 * m * qq;
            for (std::size_t c=0; c<m_classes; c++) gradient(c) -= dg;
            gradient(idx) -= dg;

            gain += m * (g - dg);
        }

        return gain;
    }

protected:
    using QpMcLinear<InputT>::add_scaled;
    using QpMcLinear<InputT>::m_data;
    using QpMcLinear<InputT>::m_classes;
};
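
// Illustrative sketch (not part of the original header): the box-constrained
// Newton step used by solveSub() above and by the solvers below. For a
// single coordinate with gradient g, curvature qq, and box [0, C], the
// unconstrained step m = g / qq is clipped so that alpha stays feasible.
// For example, with alpha = 0.8, g = 1.2, qq = 2.0, and C = 1.0, the
// unconstrained step m = 0.6 would move alpha to 1.4 > C, so it is clipped
// to m = C - alpha = 0.2 and alpha is set to C. The gain of the (possibly
// clipped) step is m * (g - 0.5 * m * qq), the exact change of the dual
// objective along that coordinate.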


/// \brief Solver for the multi-class SVM by Lee, Lin & Wahba.
template <class InputT>
class QpMcLinearLLW : public QpMcLinear<InputT>
{
public:
    typedef typename QpMcLinear<InputT>::DatasetType DatasetType;

    /// \brief Constructor
    QpMcLinearLLW(
            const DatasetType& dataset,
            std::size_t dim,
            std::size_t classes)
    : QpMcLinear<InputT>(dataset, dim, classes)
    { }

protected:
    /// \brief Compute the gradient from the inner products of the weight vectors with the current sample.
    virtual double calcGradient(RealVector& gradient, RealVector wx, blas::matrix_row<RealMatrix> const& alpha, double C, unsigned int y)
    {
        double violation = 0.0;
        for (std::size_t c=0; c<m_classes; c++)
        {
            if (c == y)
            {
                gradient(c) = 0.0;
            }
            else
            {
                const double g = 1.0 + wx(c);
                gradient(c) = g;
                if (g > violation && alpha(c) < C) violation = g;
                else if (-g > violation && alpha(c) > 0.0) violation = -g;
            }
        }
        return violation;
    }

    /// \brief Update the weight vectors (primal variables) after a step on the dual variables.
    virtual void updateWeightVectors(RealMatrix& w, RealVector const& mu, std::size_t index)
    {
        double mean_mu = 0.0;
        for (std::size_t c=0; c<m_classes; c++) mean_mu += mu(c);
        mean_mu /= (double)m_classes;
        RealVector step(m_classes);
        for (std::size_t c=0; c<m_classes; c++) step(c) = mean_mu - mu(c);
        add_scaled(w, step, m_data[index].input);
    }

    /// \brief Solve the sub-problem posed by a single training example.
    virtual double solveSub(double epsilon, RealVector gradient, double q, double C, unsigned int y, blas::matrix_row<RealMatrix>& alpha, RealVector& mu)
    {
        const double ood = 1.0 / m_classes;
        const double qq = (1.0 - ood) * q;
        double gain = 0.0;

        // SMO loop
        std::size_t iter, maxiter = 10 * m_classes;
        for (iter=0; iter<maxiter; iter++)
        {
            // select working set
            std::size_t idx = 0;
            double kkt = 0.0;
            for (std::size_t c=0; c<m_classes; c++)
            {
                if (c == y) continue;

                const double g = gradient(c);
                const double a = alpha(c);
                if (g > kkt && a < C) { kkt = g; idx = c; }
                else if (-g > kkt && a > 0.0) { kkt = -g; idx = c; }
            }

            // check stopping criterion
            if (kkt < epsilon) break;

            // perform step
            const double a = alpha(idx);
            const double g = gradient(idx);
            double m = g / qq;
            double a_new = a + m;
            if (a_new <= 0.0)
            {
                m = -a;
                a_new = 0.0;
            }
            else if (a_new >= C)
            {
                m = C - a;
                a_new = C;
            }
            alpha(idx) = a_new;
            mu(idx) += m;

            // update gradient and total gain
            const double dg = m * q;
            const double dgc = dg / m_classes;
            for (std::size_t c=0; c<m_classes; c++) gradient(c) += dgc;
            gradient(idx) -= dg;

            gain += m * (g - 0.5 * (dg - dgc));
        }

        return gain;
    }

protected:
    using QpMcLinear<InputT>::add_scaled;
    using QpMcLinear<InputT>::m_data;
    using QpMcLinear<InputT>::m_classes;
};
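
// Illustrative worked equation (not part of the original header): the
// mean_mu correction in updateWeightVectors() above keeps the LLW class
// constraint sum_c w_c = 0 intact. The per-class step is
// step(c) = mean_mu - mu(c), and summing over all classes gives
//     sum_c step(c) = m_classes * mean_mu - sum_c mu(c) = 0,
// since mean_mu = (sum_c mu(c)) / m_classes. The raw dual step is thus
// projected onto the sum-to-zero subspace before it is applied to w.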


/// \brief Solver for the multi-class SVM with absolute margin and total sum loss.
template <class InputT>
class QpMcLinearATS : public QpMcLinear<InputT>
{
public:
    typedef typename QpMcLinear<InputT>::DatasetType DatasetType;

    /// \brief Constructor
    QpMcLinearATS(
            const DatasetType& dataset,
            std::size_t dim,
            std::size_t classes)
    : QpMcLinear<InputT>(dataset, dim, classes)
    { }

protected:
    /// \brief Compute the gradient from the inner products of the weight vectors with the current sample.
    virtual double calcGradient(RealVector& gradient, RealVector wx, blas::matrix_row<RealMatrix> const& alpha, double C, unsigned int y)
    {
        double violation = 0.0;
        for (std::size_t c=0; c<m_classes; c++)
        {
            const double g = (c == y) ? 1.0 - wx(y) : 1.0 + wx(c);
            gradient(c) = g;
            if (g > violation && alpha(c) < C) violation = g;
            else if (-g > violation && alpha(c) > 0.0) violation = -g;
        }
        return violation;
    }

    /// \brief Update the weight vectors (primal variables) after a step on the dual variables.
    virtual void updateWeightVectors(RealMatrix& w, RealVector const& mu, std::size_t index)
    {
        unsigned int y = m_data[index].label;
        double mean = -2.0 * mu(y);
        for (std::size_t c=0; c<m_classes; c++) mean += mu(c);
        mean /= (double)m_classes;
        RealVector step(m_classes);
        for (std::size_t c=0; c<m_classes; c++) step(c) = ((c == y) ? (mu(c) + mean) : (mean - mu(c)));
        add_scaled(w, step, m_data[index].input);
    }

    /// \brief Solve the sub-problem posed by a single training example.
    virtual double solveSub(double epsilon, RealVector gradient, double q, double C, unsigned int y, blas::matrix_row<RealMatrix>& alpha, RealVector& mu)
    {
        const double ood = 1.0 / m_classes;
        const double qq = (1.0 - ood) * q;
        double gain = 0.0;

        // SMO loop
        std::size_t iter, maxiter = 10 * m_classes;
        for (iter=0; iter<maxiter; iter++)
        {
            // select working set
            std::size_t idx = 0;
            double kkt = 0.0;
            for (std::size_t c=0; c<m_classes; c++)
            {
                const double g = gradient(c);
                const double a = alpha(c);
                if (g > kkt && a < C) { kkt = g; idx = c; }
                else if (-g > kkt && a > 0.0) { kkt = -g; idx = c; }
            }

            // check stopping criterion
            if (kkt < epsilon) break;

            // perform step
            const double a = alpha(idx);
            const double g = gradient(idx);
            double m = g / qq;
            double a_new = a + m;
            if (a_new <= 0.0)
            {
                m = -a;
                a_new = 0.0;
            }
            else if (a_new >= C)
            {
                m = C - a;
                a_new = C;
            }
            alpha(idx) = a_new;
            mu(idx) += m;

            // update gradient and total gain
            const double dg = m * q;
            const double dgc = dg / m_classes;
            if (idx == y)
            {
                for (std::size_t c=0; c<m_classes; c++) gradient(c) -= dgc;
                gradient(idx) -= dg - 2.0 * dgc;
            }
            else
            {
                for (std::size_t c=0; c<m_classes; c++) gradient(c) += (c == y) ? -dgc : dgc;
                gradient(idx) -= dg;
            }

            gain += m * (g - 0.5 * (dg - dgc));
        }

        return gain;
    }

protected:
    using QpMcLinear<InputT>::add_scaled;
    using QpMcLinear<InputT>::m_data;
    using QpMcLinear<InputT>::m_classes;
};
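
// Illustrative worked equation (not part of the original header): the
// effective curvature qq = (1 - 1/m_classes) * q used by solveSub() above.
// A unit dual step on one class changes the corresponding weight vector by
// x minus the mean correction x / m_classes (see updateWeightVectors()), so
// the gradient of the chosen coordinate changes at rate
//     <x - x / m_classes, x> = (1 - 1/m_classes) * <x, x> = (1 - 1/K) * q
// per unit step, which is exactly the Newton denominator qq computed above.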


/// \brief Solver for the multi-class maximum margin regression SVM.
template <class InputT>
class QpMcLinearMMR : public QpMcLinear<InputT>
{
public:
    typedef typename QpMcLinear<InputT>::DatasetType DatasetType;

    /// \brief Constructor
    QpMcLinearMMR(
            const DatasetType& dataset,
            std::size_t dim,
            std::size_t classes)
    : QpMcLinear<InputT>(dataset, dim, classes)
    { }

protected:
    /// \brief Compute the gradient from the inner products of the weight vectors with the current sample.
    virtual double calcGradient(RealVector& gradient, RealVector wx, blas::matrix_row<RealMatrix> const& alpha, double C, unsigned int y)
    {
        for (std::size_t c=0; c<m_classes; c++) gradient(c) = 0.0;
        const double g = 1.0 - wx(y);
        gradient(y) = g;
        const double a = alpha(0);
        if (g > 0.0)
        {
            if (a == C) return 0.0;
            else return g;
        }
        else
        {
            if (a == 0.0) return 0.0;
            else return -g;
        }
    }

    /// \brief Update the weight vectors (primal variables) after a step on the dual variables.
    virtual void updateWeightVectors(RealMatrix& w, RealVector const& mu, std::size_t index)
    {
        unsigned int y = m_data[index].label;
        double s = mu(0);
        double sc = -s / m_classes;
        double sy = s + sc;
        RealVector step(m_classes);
        for (std::size_t c=0; c<m_classes; c++) step(c) = (c == y) ? sy : sc;
        add_scaled(w, step, m_data[index].input);
    }

    /// \brief Solve the sub-problem posed by a single training example.
    virtual double solveSub(double epsilon, RealVector gradient, double q, double C, unsigned int y, blas::matrix_row<RealMatrix>& alpha, RealVector& mu)
    {
        const double ood = 1.0 / m_classes;
        const double qq = (1.0 - ood) * q;

        double kkt = 0.0;
        const double g = gradient(y);
        const double a = alpha(0);
        if (g > kkt && a < C) kkt = g;
        else if (-g > kkt && a > 0.0) kkt = -g;

        // check stopping criterion
        if (kkt < epsilon) return 0.0;

        // perform step
        double m = g / qq;
        double a_new = a + m;
        if (a_new <= 0.0)
        {
            m = -a;
            a_new = 0.0;
        }
        else if (a_new >= C)
        {
            m = C - a;
            a_new = C;
        }
        alpha(0) = a_new;
        mu(0) = m;

        // return the gain
        return m * (g - 0.5 * m * qq);
    }

protected:
    using QpMcLinear<InputT>::add_scaled;
    using QpMcLinear<InputT>::m_data;
    using QpMcLinear<InputT>::m_classes;
};
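
// Illustrative sketch (not part of the original header): MMR has a single
// dual variable per training example, stored in alpha(0), so solveSub()
// above reduces to one clipped Newton step instead of an SMO loop. For
// example, with gradient g = 1 - <w_y, x> = 0.7, q = <x, x> = 2.0, and 10
// classes, the curvature is qq = (1 - 1/10) * 2.0 = 1.8 and the
// unconstrained step is m = 0.7 / 1.8 ~= 0.389, clipped to the box [0, C]
// exactly as in the multi-variable solvers.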


/// \brief Solver for the multi-class SVM by Crammer & Singer.
template <class InputT>
class QpMcLinearCS : public QpMcLinear<InputT>
{
public:
    typedef typename QpMcLinear<InputT>::DatasetType DatasetType;

    /// \brief Constructor
    QpMcLinearCS(
            const DatasetType& dataset,
            std::size_t dim,
            std::size_t classes)
    : QpMcLinear<InputT>(dataset, dim, classes)
    { }

protected:
    /// \brief Compute the gradient from the inner products of the weight vectors with the current sample.
    virtual double calcGradient(RealVector& gradient, RealVector wx, blas::matrix_row<RealMatrix> const& alpha, double C, unsigned int y)
    {
        if (alpha(m_classes) < C)
        {
            double violation = 0.0;
            for (std::size_t c=0; c<wx.size(); c++)
            {
                if (c == y)
                {
                    gradient(c) = 0.0;
                }
                else
                {
                    const double g = 1.0 - 0.5 * (wx(y) - wx(c));
                    gradient(c) = g;
                    if (g > violation) violation = g;
                    else if (-g > violation && alpha(c) > 0.0) violation = -g;
                }
            }
            return violation;
        }
        else
        {
            // kkt_up starts at zero so that a pure "down" step, which stays
            // feasible when the sum constraint is active, is accounted for
            double kkt_up = 0.0, kkt_down = 1e100;
            for (std::size_t c=0; c<m_classes; c++)
            {
                if (c == y)
                {
                    gradient(c) = 0.0;
                }
                else
                {
                    const double g = 1.0 - 0.5 * (wx(y) - wx(c));
                    gradient(c) = g;
                    if (g > kkt_up && alpha(c) < C) kkt_up = g;
                    if (g < kkt_down && alpha(c) > 0.0) kkt_down = g;
                }
            }
            return std::max(0.0, kkt_up - kkt_down);
        }
    }

    /// \brief Update the weight vectors (primal variables) after a step on the dual variables.
    virtual void updateWeightVectors(RealMatrix& w, RealVector const& mu, std::size_t index)
    {
        unsigned int y = m_data[index].label;
        double sum_mu = 0.0;
        for (std::size_t c=0; c<m_classes; c++) if (c != y) sum_mu += mu(c);
        RealVector step(-0.5 * mu);
        step(y) = 0.5 * sum_mu;
        add_scaled(w, step, m_data[index].input);
    }

    /// \brief Solve the sub-problem posed by a single training example.
    virtual double solveSub(double epsilon, RealVector gradient, double q, double C, unsigned int y, blas::matrix_row<RealMatrix>& alpha, RealVector& mu)
    {
        const double qq = 0.5 * q;
        double gain = 0.0;

        // SMO loop
        std::size_t iter, maxiter = 10 * m_classes;
        for (iter=0; iter<maxiter; iter++)
        {
            // select working set
            std::size_t idx = 0;
            std::size_t idx_up = 0, idx_down = 0;
            bool size2 = false;
            double kkt = 0.0;
            double grad = 0.0;
            if (alpha(m_classes) == C)
            {
                double kkt_up = -1e100, kkt_down = 1e100;
                for (std::size_t c=0; c<m_classes; c++)
                {
                    if (c == y) continue;

                    const double g = gradient(c);
                    const double a = alpha(c);
                    if (g > kkt_up && a < C) { kkt_up = g; idx_up = c; }
                    if (g < kkt_down && a > 0.0) { kkt_down = g; idx_down = c; }
                }

                if (kkt_up <= 0.0)
                {
                    idx = idx_down;
                    grad = kkt_down;
                    kkt = -kkt_down;
                }
                else
                {
                    grad = kkt_up - kkt_down;
                    kkt = grad;
                    size2 = true;
                }
            }
            else
            {
                for (std::size_t c=0; c<m_classes; c++)
                {
                    if (c == y) continue;

                    const double g = gradient(c);
                    const double a = alpha(c);
                    if (g > kkt) { kkt = g; idx = c; }
                    else if (-g > kkt && a > 0.0) { kkt = -g; idx = c; }
                }
                grad = gradient(idx);
            }

            // check stopping criterion
            if (kkt < epsilon) return gain;

            if (size2)
            {
                // perform step
                const double a_up = alpha(idx_up);
                const double a_down = alpha(idx_down);
                double m = grad / qq;
                double a_up_new = a_up + m;
                double a_down_new = a_down - m;
                if (a_down_new <= 0.0)
                {
                    m = a_down;
                    a_up_new = a_up + m;
                    a_down_new = 0.0;
                }
                alpha(idx_up) = a_up_new;
                alpha(idx_down) = a_down_new;
                mu(idx_up) += m;
                mu(idx_down) -= m;

                // update gradient and total gain
                const double dg = 0.5 * m * qq;
                gradient(idx_up) -= dg;
                gradient(idx_down) += dg;
                gain += m * (grad - 2.0 * dg);
            }
            else
            {
                // perform step
                const double a = alpha(idx);
                const double a_sum = alpha(m_classes);
                double m = grad / qq;
                double a_new = a + m;
                double a_sum_new = a_sum + m;
                if (a_new <= 0.0)
                {
                    m = -a;
                    a_new = 0.0;
                    a_sum_new = a_sum + m;
                }
                else if (a_sum_new >= C)
                {
                    m = C - a_sum;
                    a_sum_new = C;
                    a_new = a + m;
                }
                alpha(idx) = a_new;
                alpha(m_classes) = a_sum_new;
                mu(idx) += m;

                // update gradient and total gain
                const double dg = 0.5 * m * qq;
                for (std::size_t c=0; c<m_classes; c++) gradient(c) -= dg;
                gradient(idx) -= dg;
                gain += m * (grad - dg);
            }
        }

        return gain;
    }

protected:
    using QpMcLinear<InputT>::add_scaled;
    using QpMcLinear<InputT>::m_data;
    using QpMcLinear<InputT>::m_classes;
};
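
// Illustrative sketch (not part of the original header): why solveSub()
// above switches to pair steps. The Crammer-Singer dual constrains the sum
// of each example's variables, tracked in the extra column alpha(m_classes).
// While the sum is below C, single coordinates can grow freely. Once
// alpha(m_classes) == C, increasing one variable requires decreasing
// another, so the solver picks the steepest feasible pair (idx_up, idx_down)
// and moves them by +m and -m, preserving the sum and hence feasibility:
//     alpha(idx_up) += m;  alpha(idx_down) -= m;  // sum unchanged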


/// \brief Solver for the multi-class SVM with absolute margin and discriminative maximum loss.
template <class InputT>
class QpMcLinearADM : public QpMcLinear<InputT>
{
public:
    typedef typename QpMcLinear<InputT>::DatasetType DatasetType;

    /// \brief Constructor
    QpMcLinearADM(
            const DatasetType& dataset,
            std::size_t dim,
            std::size_t classes)
    : QpMcLinear<InputT>(dataset, dim, classes)
    { }

protected:
    /// \brief Compute the gradient from the inner products of the weight vectors with the current sample.
    virtual double calcGradient(RealVector& gradient, RealVector wx, blas::matrix_row<RealMatrix> const& alpha, double C, unsigned int y)
    {
        if (alpha(m_classes) < C)
        {
            double violation = 0.0;
            for (std::size_t c=0; c<m_classes; c++)
            {
                if (c == y)
                {
                    gradient(c) = 0.0;
                }
                else
                {
                    const double g = 1.0 + wx(c);
                    gradient(c) = g;
                    if (g > violation) violation = g;
                    else if (-g > violation && alpha(c) > 0.0) violation = -g;
                }
            }
            return violation;
        }
        else
        {
            double kkt_up = 0.0, kkt_down = 1e100;
            for (std::size_t c=0; c<m_classes; c++)
            {
                if (c == y)
                {
                    gradient(c) = 0.0;
                }
                else
                {
                    const double g = 1.0 + wx(c);
                    gradient(c) = g;
                    if (g > kkt_up && alpha(c) < C) kkt_up = g;
                    if (g < kkt_down && alpha(c) > 0.0) kkt_down = g;
                }
            }
            return std::max(0.0, kkt_up - kkt_down);
        }
    }

    /// \brief Update the weight vectors (primal variables) after a step on the dual variables.
    virtual void updateWeightVectors(RealMatrix& w, RealVector const& mu, std::size_t index)
    {
        double mean_mu = 0.0;
        for (std::size_t c=0; c<m_classes; c++) mean_mu += mu(c);
        mean_mu /= (double)m_classes;
        RealVector step(m_classes);
        for (std::size_t c=0; c<m_classes; c++) step(c) = mean_mu - mu(c);
        add_scaled(w, step, m_data[index].input);
    }

    /// \brief Solve the sub-problem posed by a single training example.
    virtual double solveSub(double epsilon, RealVector gradient, double q, double C, unsigned int y, blas::matrix_row<RealMatrix>& alpha, RealVector& mu)
    {
        const double ood = 1.0 / m_classes;
        const double qq = (1.0 - ood) * q;
        double gain = 0.0;

        // SMO loop
        std::size_t iter, maxiter = 10 * m_classes;
        for (iter=0; iter<maxiter; iter++)
        {
            // select working set
            std::size_t idx = 0;
            std::size_t idx_up = 0, idx_down = 0;
            bool size2 = false;
            double kkt = 0.0;
            double grad = 0.0;
            if (alpha(m_classes) == C)
            {
                double kkt_up = -1e100, kkt_down = 1e100;
                for (std::size_t c=0; c<m_classes; c++)
                {
                    if (c == y) continue;

                    const double g = gradient(c);
                    const double a = alpha(c);
                    if (g > kkt_up && a < C) { kkt_up = g; idx_up = c; }
                    if (g < kkt_down && a > 0.0) { kkt_down = g; idx_down = c; }
                }

                if (kkt_up <= 0.0)
                {
                    idx = idx_down;
                    grad = kkt_down;
                    kkt = -kkt_down;
                }
                else
                {
                    grad = kkt_up - kkt_down;
                    kkt = grad;
                    size2 = true;
                }
            }
            else
            {
                for (std::size_t c=0; c<m_classes; c++)
                {
                    if (c == y) continue;

                    const double g = gradient(c);
                    const double a = alpha(c);
                    if (g > kkt) { kkt = g; idx = c; }
                    else if (-g > kkt && a > 0.0) { kkt = -g; idx = c; }
                }
                grad = gradient(idx);
            }

            // check stopping criterion
            if (kkt < epsilon) return gain;

            if (size2)
            {
                // perform step
                const double a_up = alpha(idx_up);
                const double a_down = alpha(idx_down);
                double m = grad / (2.0 * q);
                double a_up_new = a_up + m;
                double a_down_new = a_down - m;
                if (a_down_new <= 0.0)
                {
                    m = a_down;
                    a_up_new = a_up + m;
                    a_down_new = 0.0;
                }
                alpha(idx_up) = a_up_new;
                alpha(idx_down) = a_down_new;
                mu(idx_up) += m;
                mu(idx_down) -= m;

                // update gradient and total gain
                const double dg = m * q;
                const double dgc = dg / m_classes;
                gradient(idx_up) -= dg;
                gradient(idx_down) += dg;
                gain += m * (grad - (dg - dgc));
            }
            else
            {
                // perform step
                const double a = alpha(idx);
                const double a_sum = alpha(m_classes);
                double m = grad / qq;
                double a_new = a + m;
                double a_sum_new = a_sum + m;
                if (a_new <= 0.0)
                {
                    m = -a;
                    a_new = 0.0;
                    a_sum_new = a_sum + m;
                }
                else if (a_sum_new >= C)
                {
                    m = C - a_sum;
                    a_sum_new = C;
                    a_new = a + m;
                }
                alpha(idx) = a_new;
                alpha(m_classes) = a_sum_new;
                mu(idx) += m;

                // update gradient and total gain
                const double dg = m * q;
                const double dgc = dg / m_classes;
                for (std::size_t c=0; c<m_classes; c++) gradient(c) += dgc;
                gradient(idx) -= dg;
                gain += m * (grad - 0.5 * (dg - dgc));
            }
        }

        return gain;
    }

protected:
    using QpMcLinear<InputT>::add_scaled;
    using QpMcLinear<InputT>::m_data;
    using QpMcLinear<InputT>::m_classes;
};
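
// Illustrative worked equation (not part of the original header): ADM uses
// the same sum-to-C constraint handling as the Crammer-Singer solver above,
// but with absolute margins: gradient(c) = 1 + <w_c, x> for c != y instead
// of the relative margin 1 - 0.5 * (<w_y, x> - <w_c, x>). For example, with
// <w_c, x> = -0.4 the ADM gradient is g = 1 - 0.4 = 0.6 > 0, so class c
// still violates its target margin <w_c, x> <= -1 and its dual variable is
// a candidate for an "up" step.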


/// \brief Solver for the multi-class SVM with absolute margin and total maximum loss.
template <class InputT>
class QpMcLinearATM : public QpMcLinear<InputT>
{
public:
    typedef typename QpMcLinear<InputT>::DatasetType DatasetType;

    /// \brief Constructor
    QpMcLinearATM(
            const DatasetType& dataset,
            std::size_t dim,
            std::size_t classes)
    : QpMcLinear<InputT>(dataset, dim, classes)
    { }

protected:
    /// \brief Compute the gradient from the inner products of the weight vectors with the current sample.
    virtual double calcGradient(RealVector& gradient, RealVector wx, blas::matrix_row<RealMatrix> const& alpha, double C, unsigned int y)
    {
        if (alpha(m_classes) < C)
        {
            double violation = 0.0;
            for (std::size_t c=0; c<m_classes; c++)
            {
                const double g = (c == y) ? 1.0 - wx(y) : 1.0 + wx(c);
                gradient(c) = g;
                if (g > violation) violation = g;
                else if (-g > violation && alpha(c) > 0.0) violation = -g;
            }
            return violation;
        }
        else
        {
            double kkt_up = 0.0, kkt_down = 1e100;
            for (std::size_t c=0; c<m_classes; c++)
            {
                const double g = (c == y) ? 1.0 - wx(y) : 1.0 + wx(c);
                gradient(c) = g;
                if (g > kkt_up && alpha(c) < C) kkt_up = g;
                if (g < kkt_down && alpha(c) > 0.0) kkt_down = g;
            }
            return std::max(0.0, kkt_up - kkt_down);
        }
    }

    /// \brief Update the weight vectors (primal variables) after a step on the dual variables.
    virtual void updateWeightVectors(RealMatrix& w, RealVector const& mu, std::size_t index)
    {
        unsigned int y = m_data[index].label;
        double mean = -2.0 * mu(y);
        for (std::size_t c=0; c<m_classes; c++) mean += mu(c);
        mean /= (double)m_classes;
        RealVector step(m_classes);
        for (std::size_t c=0; c<m_classes; c++) step(c) = (c == y) ? (mu(c) + mean) : (mean - mu(c));
        add_scaled(w, step, m_data[index].input);
    }

    /// \brief Solve the sub-problem posed by a single training example.
    virtual double solveSub(double epsilon, RealVector gradient, double q, double C, unsigned int y, blas::matrix_row<RealMatrix>& alpha, RealVector& mu)
    {
        const double ood = 1.0 / m_classes;
        const double qq = (1.0 - ood) * q;
        double gain = 0.0;

        // SMO loop
        std::size_t iter, maxiter = 10 * m_classes;
        for (iter=0; iter<maxiter; iter++)
        {
            // select working set
            std::size_t idx = 0;
            std::size_t idx_up = 0, idx_down = 0;
            bool size2 = false;
            double kkt = 0.0;
            double grad = 0.0;
            if (alpha(m_classes) == C)
            {
                double kkt_up = -1e100, kkt_down = 1e100;
                for (std::size_t c=0; c<m_classes; c++)
                {
                    const double g = gradient(c);
                    const double a = alpha(c);
                    if (g > kkt_up && a < C) { kkt_up = g; idx_up = c; }
                    if (g < kkt_down && a > 0.0) { kkt_down = g; idx_down = c; }
                }

                if (kkt_up <= 0.0)
                {
                    idx = idx_down;
                    grad = kkt_down;
                    kkt = -kkt_down;
                }
                else
                {
                    grad = kkt_up - kkt_down;
                    kkt = grad;
                    size2 = true;
                }
            }
            else
            {
                for (std::size_t c=0; c<m_classes; c++)
                {
                    const double g = gradient(c);
                    const double a = alpha(c);
                    if (g > kkt) { kkt = g; idx = c; }
                    else if (-g > kkt && a > 0.0) { kkt = -g; idx = c; }
                }
                grad = gradient(idx);
            }

            // check stopping criterion
            if (kkt < epsilon) return gain;

            if (size2)
            {
                // perform step
                const double a_up = alpha(idx_up);
                const double a_down = alpha(idx_down);
                double m = grad / (2.0 * q);
                double a_up_new = a_up + m;
                double a_down_new = a_down - m;
                if (a_down_new <= 0.0)
                {
                    m = a_down;
                    a_up_new = a_up + m;
                    a_down_new = 0.0;
                }
                alpha(idx_up) = a_up_new;
                alpha(idx_down) = a_down_new;
                mu(idx_up) += m;
                mu(idx_down) -= m;

                // update gradient and total gain
                const double dg = m * q;
                const double dgc = dg / m_classes;
                if (idx_up == y)
                {
                    for (std::size_t c=0; c<m_classes; c++) gradient(c) -= dgc;
                    gradient(idx_up) -= dg - 2.0 * dgc;
                    gradient(idx_down) += dg;
                }
                else if (idx_down == y)
                {
                    gradient(idx_up) -= dg;
                    gradient(idx_down) += dg - 2.0 * dgc;
                }
                else
                {
                    gradient(idx_up) -= dg;
                    gradient(idx_down) += dg;
                }
                gain += m * (grad - (dg - dgc));
            }
            else
            {
                // perform step
                const double a = alpha(idx);
                const double a_sum = alpha(m_classes);
                double m = grad / qq;
                double a_new = a + m;
                double a_sum_new = a_sum + m;
                if (a_new <= 0.0)
                {
                    m = -a;
                    a_new = 0.0;
                    a_sum_new = a_sum + m;
                }
                else if (a_sum_new >= C)
                {
                    m = C - a_sum;
                    a_sum_new = C;
                    a_new = a + m;
                }
                alpha(idx) = a_new;
                alpha(m_classes) = a_sum_new;
                mu(idx) += m;

                // update gradient and total gain
                const double dg = m * q;
                const double dgc = dg / m_classes;
                if (idx == y)
                {
                    for (std::size_t c=0; c<m_classes; c++) gradient(c) -= dgc;
                    gradient(idx) -= dg - 2.0 * dgc;
                }
                else
                {
                    for (std::size_t c=0; c<m_classes; c++) gradient(c) += (c == y) ? -dgc : dgc;
                    gradient(idx) -= dg;
                }
                gain += m * (grad - 0.5 * (dg - dgc));
            }
        }

        return gain;
    }

protected:
    using QpMcLinear<InputT>::add_scaled;
    using QpMcLinear<InputT>::m_data;
    using QpMcLinear<InputT>::m_classes;
};
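
// Illustrative sketch (not part of the original header): the case analysis
// in solveSub() above exists because the true class enters the ATM margin
// with opposite sign, gradient(y) = 1 - <w_y, x> versus
// gradient(c) = 1 + <w_c, x> for c != y. A step involving y therefore
// propagates through the mean correction with a flipped sign, which is why
// the branches idx_up == y, idx_down == y, and idx == y apply different
// combinations of dg = m * q and dgc = dg / m_classes.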


/// \brief Solver for the "reinforced" multi-class SVM.
template <class InputT>
class QpMcLinearReinforced : public QpMcLinear<InputT>
{
public:
    typedef typename QpMcLinear<InputT>::DatasetType DatasetType;

    /// \brief Constructor
    QpMcLinearReinforced(
            const DatasetType& dataset,
            std::size_t dim,
            std::size_t classes)
    : QpMcLinear<InputT>(dataset, dim, classes)
    { }

protected:
    /// \brief Compute the gradient from the inner products of the weight vectors with the current sample.
    virtual double calcGradient(RealVector& gradient, RealVector wx, blas::matrix_row<RealMatrix> const& alpha, double C, unsigned int y)
    {
        double violation = 0.0;
        for (std::size_t c=0; c<m_classes; c++)
        {
            const double g = (c == y) ? (m_classes - 1.0) - wx(y) : 1.0 + wx(c);
            gradient(c) = g;
            if (g > violation && alpha(c) < C) violation = g;
            else if (-g > violation && alpha(c) > 0.0) violation = -g;
        }
        return violation;
    }

    /// \brief Update the weight vectors (primal variables) after a step on the dual variables.
    virtual void updateWeightVectors(RealMatrix& w, RealVector const& mu, std::size_t index)
    {
        unsigned int y = m_data[index].label;
        double mean = -2.0 * mu(y);
        for (std::size_t c=0; c<m_classes; c++) mean += mu(c);
        mean /= (double)m_classes;
        RealVector step(m_classes);
        for (std::size_t c=0; c<m_classes; c++) step(c) = ((c == y) ? (mu(c) + mean) : (mean - mu(c)));
        add_scaled(w, step, m_data[index].input);
    }

    /// \brief Solve the sub-problem posed by a single training example.
    virtual double solveSub(double epsilon, RealVector gradient, double q, double C, unsigned int y, blas::matrix_row<RealMatrix>& alpha, RealVector& mu)
    {
        const double ood = 1.0 / m_classes;
        const double qq = (1.0 - ood) * q;
        double gain = 0.0;

        // SMO loop
        std::size_t iter, maxiter = 10 * m_classes;
        for (iter=0; iter<maxiter; iter++)
        {
            // select working set
            std::size_t idx = 0;
            double kkt = 0.0;
            for (std::size_t c=0; c<m_classes; c++)
            {
                const double g = gradient(c);
                const double a = alpha(c);
                if (g > kkt && a < C) { kkt = g; idx = c; }
                else if (-g > kkt && a > 0.0) { kkt = -g; idx = c; }
            }

            // check stopping criterion
            if (kkt < epsilon) break;

            // perform step
            const double a = alpha(idx);
            const double g = gradient(idx);
            double m = g / qq;
            double a_new = a + m;
            if (a_new <= 0.0)
            {
                m = -a;
                a_new = 0.0;
            }
            else if (a_new >= C)
            {
                m = C - a;
                a_new = C;
            }
            alpha(idx) = a_new;
            mu(idx) += m;

            // update gradient and total gain
            const double dg = m * q;
            const double dgc = dg / m_classes;
            if (idx == y)
            {
                for (std::size_t c=0; c<m_classes; c++) gradient(c) -= dgc;
                gradient(idx) -= dg - 2.0 * dgc;
            }
            else
            {
                for (std::size_t c=0; c<m_classes; c++) gradient(c) += (c == y) ? -dgc : dgc;
                gradient(idx) -= dg;
            }

            gain += m * (g - 0.5 * (dg - dgc));
        }

        return gain;
    }

protected:
    using QpMcLinear<InputT>::add_scaled;
    using QpMcLinear<InputT>::m_data;
    using QpMcLinear<InputT>::m_classes;
};
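
// Illustrative worked equation (not part of the original header): the
// "reinforced" loss rewards the true class with target margin m_classes - 1
// instead of 1, hence gradient(y) = (m_classes - 1) - <w_y, x> in
// calcGradient() above. With 4 classes and <w_y, x> = 2.5 this gives
// g = 3 - 2.5 = 0.5, so the true class's variable is still pushed upward,
// whereas the plain ATS gradient 1 - 2.5 = -1.5 would push it down.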


}
#endif