QpBoxLinear.h
//===========================================================================
/*!
 *
 *
 * \brief Quadratic programming solver for linear SVM training without bias
 *
 *
 *
 * \author T. Glasmachers
 * \date -
 *
 *
 * \par Copyright 1995-2017 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <http://shark-ml.org/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================


#ifndef SHARK_ALGORITHMS_QP_QPBOXLINEAR_H
#define SHARK_ALGORITHMS_QP_QPBOXLINEAR_H

#include <shark/Algorithms/QP/QuadraticProgram.h>
#include <shark/Core/Timer.h>
#include <shark/Data/Dataset.h>
#include <shark/Data/DataView.h>
#include <shark/LinAlg/Base.h>
#include <algorithm>
#include <cmath>
#include <iostream>


namespace shark {


///
/// \brief Quadratic program solver for box-constrained problems with linear kernel
///
/// \par
/// The QpBoxLinear class is a decomposition-based solver for linear
/// support vector machine classifiers trained with the hinge loss.
/// Its optimization scheme is largely based on the paper<br>
/// "A Dual Coordinate Descent Method for Large-scale Linear SVM"
/// by Hsieh, Chang, and Lin, ICML, 2008.
/// The present algorithm differs in one important respect, however:
/// it learns variable preferences as a replacement for the missing
/// working set selection, and this mechanism also takes the place
/// of the shrinking heuristic.
///
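/// \par Example
/// A minimal usage sketch. The setup is hypothetical: loadData() stands in
/// for any source of a binary LabeledData<RealVector, unsigned int> with
/// labels in {0,1}, and inputDimension() is assumed available from the
/// Data headers; the solver calls mirror the interface below:
/// \code
/// LabeledData<RealVector, unsigned int> data = loadData();
/// QpBoxLinear<RealVector> solver(data, inputDimension(data));
/// QpStoppingCondition stop;
/// stop.minAccuracy = 1e-3;                   // maximal tolerated KKT violation
/// QpSolutionProperties prop;
/// double C = 1.0;                            // SVM complexity parameter
/// solver.solve(C, 0.0, stop, &prop, false);  // bound = C, reg = 0: hinge loss
/// RealVector const& w = solver.solutionWeightVector();
/// \endcode
///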
template <class InputT>
class QpBoxLinear
{
public:
    typedef LabeledData<InputT, unsigned int> DatasetType;

    ///
    /// \brief Constructor
    ///
    /// \param dataset  training data
    /// \param dim      problem dimension
    ///
    QpBoxLinear(const DatasetType& dataset, std::size_t dim)
    : m_data(dataset)
    , m_dim(dim)
    , m_xSquared(m_data.size())
    , m_alpha(m_data.size(), 0.0)
    , m_weights(m_dim, 0.0)
    , m_pref(m_data.size(), 1.0)
    , m_offset(0)
    {
        SHARK_ASSERT(dim > 0);

        // pre-compute squared norms
        for (std::size_t i=0; i<m_data.size(); i++)
        {
            auto const& x_i = m_data[i];
            m_xSquared(i) = norm_sqr(x_i.input);
        }
    }

    /// \brief Set the value of the fixed offset (bias) term.
    void setOffset(double newOffset){
        m_offset = newOffset;
    }

    /// \brief Value of \f$ \sum_i y_i \alpha_i \f$, used as the gradient
    /// for optimizing the offset in an outer loop.
    double offsetGradient()const{
        double result = 0;
        for(std::size_t i = 0; i != m_data.size(); ++i){
            double y_i = (m_data[i].label > 0) ? +1.0 : -1.0;
            result += m_alpha(i) * y_i;
        }
        return result;
    }

    /// \brief Weight vector of the current solution.
    RealVector const& solutionWeightVector()const{
        return m_weights;
    }

    ///
    /// \brief Solve the SVM training problem.
    ///
    /// \param bound    upper bound for the components of \f$ \alpha \f$; this is the complexity parameter C of the hinge loss SVM
    /// \param reg      coefficient of the penalty term \f$ -\frac{reg}{2} \cdot \|\alpha\|^2 \f$; reg = 1/C, where C is the complexity parameter of the squared hinge loss SVM
    /// \param stop     stopping condition(s)
    /// \param prop     solution properties
    /// \param verbose  if true, the solver prints status information and solution statistics
    ///
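    ///
    /// \par
    /// For a fixed offset \f$ b \f$ (see setOffset) the solver maximizes the dual objective
    /// \f$ W(\alpha) = \sum_i (1 - b y_i) \alpha_i - \frac{1}{2} \|w(\alpha)\|^2 - \frac{reg}{2} \|\alpha\|^2 \f$
    /// subject to \f$ 0 \leq \alpha_i \leq bound \f$, where \f$ w(\alpha) = \sum_i y_i \alpha_i x_i \f$.
    /// This is consistent with the coordinate-wise gradient computed in the inner loop,
    /// \f$ \partial W / \partial \alpha_i = 1 - b y_i - y_i \langle w, x_i \rangle - reg \cdot \alpha_i \f$.
    ///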
    void solve(
        double bound,
        double reg,
        QpStoppingCondition& stop,
        QpSolutionProperties* prop = NULL,
        bool verbose = false
    ){
        // sanity checks
        SHARK_ASSERT(bound > 0.0);
        SHARK_ASSERT(reg >= 0.0);

        // measure training time
        Timer timer;

        // prepare dimensions and vectors
        std::size_t ell = m_data.size();
        double prefsum = sum(m_pref);   // normalization constant for m_pref
        std::vector<std::size_t> schedule(ell);

        // prepare counters
        std::size_t epoch = 0;
        std::size_t steps = 0;

        // prepare performance monitoring for self-adaptation
        double max_violation = 0.0;
        const double gain_learning_rate = 1.0 / ell;
        double average_gain = 0.0;
        bool canstop = true;

        // outer optimization loop
        while (true)
        {
            // define the schedule: draw ell variables with frequencies
            // proportional to their preferences m_pref, rounding
            // stochastically, then shuffle the result
            double psum = prefsum;
            prefsum = 0.0;
            std::size_t pos = 0;
            for (std::size_t i=0; i<ell; i++)
            {
                double p = m_pref(i);
                double num = (psum < 1e-6) ? ell - pos : std::min((double)(ell - pos), (ell - pos) * p / psum);
                std::size_t n = (std::size_t)std::floor(num);
                double prob = num - n;
                if (random::coinToss(random::globalRng, prob)) n++;
                for (std::size_t j=0; j<n; j++)
                {
                    schedule[pos] = i;
                    pos++;
                }
                psum -= p;
                prefsum += p;
            }
            SHARK_ASSERT(pos == ell);
            std::shuffle(schedule.begin(), schedule.end(), random::globalRng);

            // inner loop
            max_violation = 0.0;
            for (std::size_t j=0; j<ell; j++)
            {
                // active variable
                std::size_t i = schedule[j];
                auto const& e_i = m_data[i];
                double y_i = (e_i.label > 0) ? +1.0 : -1.0;

                // compute gradient and projected gradient
                double a = m_alpha(i);
                double wyx = y_i * inner_prod(m_weights, e_i.input);
                double g = 1.0 - m_offset * y_i - wyx - reg * a;
                double pg = (a == 0.0 && g < 0.0) ? 0.0 : (a == bound && g > 0.0 ? 0.0 : g);

                // update maximal KKT violation over the epoch
                max_violation = std::max(max_violation, std::abs(pg));
                double gain = 0.0;

                // perform the step
                if (pg != 0.0)
                {
                    // SMO-style coordinate descent step
                    double q = m_xSquared(i) + reg;
                    double mu = g / q;
                    double new_a = a + mu;

                    // numerically stable update
                    if (new_a <= 0.0)
                    {
                        mu = -a;
                        new_a = 0.0;
                    }
                    else if (new_a >= bound)
                    {
                        mu = bound - a;
                        new_a = bound;
                    }

                    // update both representations of the weight vector: m_alpha and m_weights
                    m_alpha(i) = new_a;
                    noalias(m_weights) += (mu * y_i) * e_i.input;
                    gain = mu * (g - 0.5 * q * mu);

                    steps++;
                }

                // update gain-based preferences
                {
                    if (epoch == 0) average_gain += gain / (double)ell;
                    else
                    {
                        // strategy constants
                        constexpr double CHANGE_RATE = 0.2;
                        constexpr double PREF_MIN = 0.05;
                        constexpr double PREF_MAX = 20.0;

                        double change = CHANGE_RATE * (gain / average_gain - 1.0);
                        double newpref = std::min(PREF_MAX, std::max(PREF_MIN, m_pref(i) * std::exp(change)));
                        prefsum += newpref - m_pref(i);
                        m_pref(i) = newpref;
                        average_gain = (1.0 - gain_learning_rate) * average_gain + gain_learning_rate * gain;
                    }
                }
            }

            epoch++;

            // stopping criteria
            if (stop.maxIterations > 0 && ell * epoch >= stop.maxIterations)
            {
                if (prop != NULL) prop->type = QpMaxIterationsReached;
                break;
            }

            if (timer.stop() >= stop.maxSeconds)
            {
                if (prop != NULL) prop->type = QpTimeout;
                break;
            }

            if (max_violation < stop.minAccuracy)
            {
                if (verbose) std::cout << "#" << std::flush;
                if (canstop)
                {
                    if (prop != NULL) prop->type = QpAccuracyReached;
                    break;
                }
                else
                {
                    // prepare a full sweep for a reliable check of the stopping criterion
                    canstop = true;
                    for (std::size_t i=0; i<ell; i++) m_pref(i) = 1.0;
                    prefsum = (double)ell;
                }
            }
            else
            {
                if (verbose) std::cout << "." << std::flush;
                canstop = false;
            }
        }

        timer.stop();

        // compute solution statistics
        std::size_t free_SV = 0;
        std::size_t bounded_SV = 0;
        double objective = -0.5 * norm_sqr(m_weights);
        for (std::size_t i=0; i<ell; i++)
        {
            double a = m_alpha(i);
            if (a > 0.0)
            {
                objective += a;
                objective -= reg/2.0 * a * a;
                if (a == bound) bounded_SV++;
                else free_SV++;
            }
        }

        // return solution statistics
        if (prop != NULL)
        {
            prop->accuracy = max_violation;   // approximate, but a good guess
            prop->iterations = ell * epoch;
            prop->value = objective;
            prop->seconds = timer.lastLap();
        }

        // output solution statistics
        if (verbose)
        {
            std::cout << std::endl;
            std::cout << "training time (seconds): " << timer.lastLap() << std::endl;
            std::cout << "number of epochs: " << epoch << std::endl;
            std::cout << "number of iterations: " << (ell * epoch) << std::endl;
            std::cout << "number of non-zero steps: " << steps << std::endl;
            std::cout << "dual accuracy: " << max_violation << std::endl;
            std::cout << "dual objective value: " << objective << std::endl;
            std::cout << "number of free support vectors: " << free_SV << std::endl;
            std::cout << "number of bounded support vectors: " << bounded_SV << std::endl;
        }
    }

protected:
    DataView<const DatasetType> m_data;   ///< view on the training data
    std::size_t m_dim;                    ///< input space dimension
    RealVector m_xSquared;                ///< diagonal entries of the quadratic matrix
    RealVector m_alpha;                   ///< storage of the alpha values for warm start
    RealVector m_weights;                 ///< storage of the weight vector for warm start
    RealVector m_pref;                    ///< measure of success of individual steps
    double m_offset;                      ///< fixed offset (bias) term entering the gradient
};
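
// Note that m_alpha and m_weights persist across calls to solve(), so a
// subsequent call warm starts from the previous solution. A hypothetical
// sketch (variable names are illustrative, not part of this interface),
// e.g. for sweeping the complexity parameter:
//
//     QpBoxLinear<RealVector> solver(data, inputDimension(data));
//     QpStoppingCondition stop;
//     for (double C : {0.1, 1.0, 10.0})
//     {
//         solver.solve(C, 0.0, stop);   // continues from the previous alpha/weights
//     }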


//~ template < >
//~ class QpBoxLinear<CompressedRealVector>
//~ {
//~ public:
    //~ typedef LabeledData<CompressedRealVector, unsigned int> DatasetType;

    //~ ///
    //~ /// \brief Constructor
    //~ ///
    //~ /// \param dataset  training data
    //~ /// \param dim      problem dimension
    //~ ///
    //~ QpBoxLinear(const DatasetType& dataset, std::size_t dim)
    //~ : x(dataset.numberOfElements())
    //~ , y(dataset.numberOfElements())
    //~ , diagonal(dataset.numberOfElements())
    //~ , m_dim(dim)
    //~ {
        //~ SHARK_ASSERT(dim > 0);

        //~ // transform ublas sparse vectors into a fast format
        //~ // (yes, ublas is slow...), and compute the diagonal
        //~ // elements of the quadratic matrix
        //~ SparseVector sparse;
        //~ for (std::size_t b=0, j=0; b<dataset.numberOfBatches(); b++)
        //~ {
            //~ DatasetType::const_batch_reference batch = dataset.batch(b);
            //~ for (std::size_t i=0; i<batch.size(); i++)
            //~ {
                //~ auto const& x_i = shark::get(batch, i).input;
                //~ // if (x_i.nnz() == 0) continue;

                //~ unsigned int y_i = shark::get(batch, i).label;
                //~ y[j] = 2.0 * y_i - 1.0;
                //~ double d = 0.0;
                //~ for (auto it=x_i.begin(); it != x_i.end(); ++it)
                //~ {
                    //~ double v = *it;
                    //~ sparse.index = it.index();
                    //~ sparse.value = v;
                    //~ storage.push_back(sparse);
                    //~ d += v * v;
                //~ }
                //~ sparse.index = (std::size_t)-1;
                //~ storage.push_back(sparse);
                //~ diagonal(j) = d;
                //~ j++;
            //~ }
        //~ }
        //~ for (std::size_t b=0, j=0, k=0; b<dataset.numberOfBatches(); b++)
        //~ {
            //~ DatasetType::const_batch_reference batch = dataset.batch(b);
            //~ for (std::size_t i=0; i<batch.size(); i++)
            //~ {
                //~ auto const& x_i = shark::get(batch, i).input;
                //~ // if (x_i.nnz() == 0) continue;

                //~ x[j] = &storage[k];   // cannot be done in the first loop because of vector reallocation
                //~ for (; storage[k].index != (std::size_t)-1; k++);
                //~ k++;
                //~ j++;
            //~ }
        //~ }
    //~ }

    //~ ///
    //~ /// \brief Solve the SVM training problem.
    //~ ///
    //~ /// \param bound    upper bound for the components of \f$ \alpha \f$; this is the complexity parameter C of the hinge loss SVM
    //~ /// \param reg      coefficient of the penalty term \f$ -\frac{reg}{2} \cdot \|\alpha\|^2 \f$; reg = 1/C, where C is the complexity parameter of the squared hinge loss SVM
    //~ /// \param stop     stopping condition(s)
    //~ /// \param prop     solution properties
    //~ /// \param verbose  if true, the solver prints status information and solution statistics
    //~ ///
    //~ RealVector solve(
        //~ double bound,
        //~ double reg,
        //~ QpStoppingCondition& stop,
        //~ QpSolutionProperties* prop = NULL,
        //~ bool verbose = false)
    //~ {
        //~ // sanity checks
        //~ SHARK_ASSERT(bound > 0.0);
        //~ SHARK_ASSERT(reg >= 0.0);

        //~ // measure training time
        //~ Timer timer;
        //~ timer.start();

        //~ // prepare dimensions and vectors
        //~ std::size_t ell = x.size();
        //~ RealVector m_alpha(ell, 0.0);
        //~ RealVector m_weights(m_dim, 0.0);
        //~ RealVector m_pref(ell, 1.0);   // measure of success of individual steps
        //~ double prefsum = ell;          // normalization constant
        //~ std::vector<std::size_t> schedule(ell);

        //~ // prepare counters
        //~ std::size_t epoch = 0;
        //~ std::size_t steps = 0;

        //~ // prepare performance monitoring for self-adaptation
        //~ double max_violation = 0.0;
        //~ const double gain_learning_rate = 1.0 / ell;
        //~ double average_gain = 0.0;
        //~ bool canstop = true;

        //~ // outer optimization loop
        //~ while (true)
        //~ {
            //~ // define schedule
            //~ double psum = prefsum;
            //~ prefsum = 0.0;
            //~ std::size_t pos = 0;
            //~ for (std::size_t i=0; i<ell; i++)
            //~ {
                //~ double p = m_pref[i];
                //~ double num = (psum < 1e-6) ? ell - pos : std::min((double)(ell - pos), (ell - pos) * p / psum);
                //~ std::size_t n = (std::size_t)std::floor(num);
                //~ double prob = num - n;
                //~ if (random::uni() < prob) n++;
                //~ for (std::size_t j=0; j<n; j++)
                //~ {
                    //~ schedule[pos] = i;
                    //~ pos++;
                //~ }
                //~ psum -= p;
                //~ prefsum += p;
            //~ }
            //~ SHARK_ASSERT(pos == ell);
            //~ for (std::size_t i=0; i<ell; i++) std::swap(schedule[i], schedule[random::discrete(0, ell - 1)]);

            //~ // inner loop
            //~ max_violation = 0.0;
            //~ for (std::size_t j=0; j<ell; j++)
            //~ {
                //~ // active variable
                //~ std::size_t i = schedule[j];
                //~ const SparseVector* x_i = x[i];

                //~ // compute gradient and projected gradient
                //~ double a = m_alpha(i);
                //~ double wyx = y(i) * inner_prod(m_weights, x_i);
                //~ double g = 1.0 - wyx - reg * a;
                //~ double pg = (a == 0.0 && g < 0.0) ? 0.0 : (a == bound && g > 0.0 ? 0.0 : g);

                //~ // update maximal KKT violation over the epoch
                //~ max_violation = std::max(max_violation, std::abs(pg));
                //~ double gain = 0.0;

                //~ // perform the step
                //~ if (pg != 0.0)
                //~ {
                    //~ // SMO-style coordinate descent step
                    //~ double q = diagonal(i) + reg;
                    //~ double mu = g / q;
                    //~ double new_a = a + mu;

                    //~ // numerically stable update
                    //~ if (new_a <= 0.0)
                    //~ {
                        //~ mu = -a;
                        //~ new_a = 0.0;
                    //~ }
                    //~ else if (new_a >= bound)
                    //~ {
                        //~ mu = bound - a;
                        //~ new_a = bound;
                    //~ }

                    //~ // update both representations of the weight vector: m_alpha and m_weights
                    //~ m_alpha(i) = new_a;
                    //~ // m_weights += (mu * y(i)) * x_i;
                    //~ axpy(m_weights, mu * y(i), x_i);
                    //~ gain = mu * (g - 0.5 * q * mu);

                    //~ steps++;
                //~ }

                //~ // update gain-based preferences
                //~ {
                    //~ if (epoch == 0) average_gain += gain / (double)ell;
                    //~ else
                    //~ {
                        //~ // strategy constants
                        //~ constexpr double CHANGE_RATE = 0.2;
                        //~ constexpr double PREF_MIN = 0.05;
                        //~ constexpr double PREF_MAX = 20.0;

                        //~ double change = CHANGE_RATE * (gain / average_gain - 1.0);
                        //~ double newpref = std::min(PREF_MAX, std::max(PREF_MIN, m_pref(i) * std::exp(change)));
                        //~ prefsum += newpref - m_pref(i);
                        //~ m_pref[i] = newpref;
                        //~ average_gain = (1.0 - gain_learning_rate) * average_gain + gain_learning_rate * gain;
                    //~ }
                //~ }
            //~ }

            //~ epoch++;

            //~ // stopping criteria
            //~ if (stop.maxIterations > 0 && ell * epoch >= stop.maxIterations)
            //~ {
                //~ if (prop != NULL) prop->type = QpMaxIterationsReached;
                //~ break;
            //~ }

            //~ if (timer.stop() >= stop.maxSeconds)
            //~ {
                //~ if (prop != NULL) prop->type = QpTimeout;
                //~ break;
            //~ }

            //~ if (max_violation < stop.minAccuracy)
            //~ {
                //~ if (verbose) std::cout << "#" << std::flush;
                //~ if (canstop)
                //~ {
                    //~ if (prop != NULL) prop->type = QpAccuracyReached;
                    //~ break;
                //~ }
                //~ else
                //~ {
                    //~ // prepare a full sweep for a reliable check of the stopping criterion
                    //~ canstop = true;
                    //~ for (std::size_t i=0; i<ell; i++) m_pref[i] = 1.0;
                    //~ prefsum = ell;
                //~ }
            //~ }
            //~ else
            //~ {
                //~ if (verbose) std::cout << "." << std::flush;
                //~ canstop = false;
            //~ }
        //~ }

        //~ timer.stop();

        //~ // compute solution statistics
        //~ std::size_t free_SV = 0;
        //~ std::size_t bounded_SV = 0;
        //~ double objective = -0.5 * shark::blas::inner_prod(m_weights, m_weights);
        //~ for (std::size_t i=0; i<ell; i++)
        //~ {
            //~ double a = m_alpha(i);
            //~ if (a > 0.0)
            //~ {
                //~ objective += a;
                //~ objective -= reg/2.0 * a * a;
                //~ if (a == bound) bounded_SV++;
                //~ else free_SV++;
            //~ }
        //~ }

        //~ // return solution statistics
        //~ if (prop != NULL)
        //~ {
            //~ prop->accuracy = max_violation;   // approximate, but a good guess
            //~ prop->iterations = ell * epoch;
            //~ prop->value = objective;
            //~ prop->seconds = timer.lastLap();
        //~ }

        //~ // output solution statistics
        //~ if (verbose)
        //~ {
            //~ std::cout << std::endl;
            //~ std::cout << "training time (seconds): " << timer.lastLap() << std::endl;
            //~ std::cout << "number of epochs: " << epoch << std::endl;
            //~ std::cout << "number of iterations: " << (ell * epoch) << std::endl;
            //~ std::cout << "number of non-zero steps: " << steps << std::endl;
            //~ std::cout << "dual accuracy: " << max_violation << std::endl;
            //~ std::cout << "dual objective value: " << objective << std::endl;
            //~ std::cout << "number of free support vectors: " << free_SV << std::endl;
            //~ std::cout << "number of bounded support vectors: " << bounded_SV << std::endl;
        //~ }

        //~ // return the solution
        //~ return m_weights;
    //~ }

//~ protected:
    //~ /// \brief Data structure for sparse vectors.
    //~ struct SparseVector
    //~ {
        //~ std::size_t index;
        //~ double value;
    //~ };

    //~ /// \brief Famous "axpy" product, here adding a multiple of a sparse vector to a dense one.
    //~ static inline void axpy(RealVector& m_weights, double m_alpha, const SparseVector* xi)
    //~ {
        //~ while (true)
        //~ {
            //~ if (xi->index == (std::size_t)-1) return;
            //~ m_weights[xi->index] += m_alpha * xi->value;
            //~ xi++;
        //~ }
    //~ }

    //~ /// \brief Inner product between a dense and a sparse vector.
    //~ static inline double inner_prod(RealVector const& m_weights, const SparseVector* xi)
    //~ {
        //~ double ret = 0.0;
        //~ while (true)
        //~ {
            //~ if (xi->index == (std::size_t)-1) return ret;
            //~ ret += m_weights[xi->index] * xi->value;
            //~ xi++;
        //~ }
    //~ }

    //~ std::vector<SparseVector> storage;   ///< storage for sparse vectors
    //~ std::vector<SparseVector*> x;        ///< sparse vectors
    //~ RealVector y;                        ///< +1/-1 labels
    //~ RealVector diagonal;                 ///< diagonal entries of the quadratic matrix
    //~ std::size_t m_dim;                   ///< input space dimension
//~ };


}
#endif