CSvmTrainer.h
Go to the documentation of this file.
1 #ifndef SHARK_ALGORITHMS_CSVMTRAINER_H
2 #define SHARK_ALGORITHMS_CSVMTRAINER_H
3
4
5
16
17 //for MCSVMs!
21 //~ #include <shark/Algorithms/Trainers/McSvm/McSvmMMRTrainer.h>
22 //~ #include <shark/Algorithms/Trainers/McSvm/McReinforcedSvmTrainer.h>
23
24 namespace shark {
25
26
27 enum class McSvm{
28  WW,
29  CS,
30  LLW,
31  ATM,
32  ATS,
34  OVA,
35  MMR,
37 };
38
39
40 ///
41 /// \brief Training of C-SVMs for binary classification.
42 ///
43 /// The C-SVM is the "standard" support vector machine for
44 /// binary (two-class) classification. Given are data tuples
45 /// \f$(x_i, y_i) \f$ with x-component denoting input and
46 /// y-component denoting the label +1 or -1 (see the tutorial on
47 /// label conventions; the implementation uses values 0/1),
48 /// a kernel function k(x, x') and a regularization
49 /// constant C > 0. Let H denote the kernel induced
50 /// reproducing kernel Hilbert space of k, and let \f$\phi \f$
51 /// denote the corresponding feature map.
52 /// Then the SVM classifier is the function
53 /// \f[
54 /// h(x) = \mathop{sign} (f(x))
55 /// \f]
56 /// \f[
57 /// f(x) = \langle w, \phi(x) \rangle + b
58 /// \f]
59 /// with coefficients w and b given by the (primal)
60 /// optimization problem
61 /// \f[
62 /// \min \frac{1}{2} \|w\|^2 + C \sum_i L(y_i, f(x_i)),
63 /// \f]
64 /// where \f$L(y, f(x)) = \max\{0, 1 - y \cdot f(x)\} \f$
65 /// denotes the hinge loss.
66 ///
67 /// For details refer to the paper:<br/>
68 /// <p>Support-Vector Networks. Corinna Cortes and Vladimir Vapnik,
69 /// Machine Learning, vol. 20 (1995), pp. 273-297.</p>
70 /// or simply to the Wikipedia article:<br/>
71 /// http://en.wikipedia.org/wiki/Support_vector_machine
72 ///
73 template <class InputType, class CacheType = float>
75  InputType, unsigned int,
76  KernelClassifier<InputType>,
77  AbstractWeightedTrainer<KernelClassifier<InputType> >
78 >
79 {
80 private:
81  typedef AbstractSvmTrainer<
82  InputType, unsigned int,
85  > base_type;
86 public:
87
88  /// \brief Convenience typedefs:
89  /// this and many of the below typedefs build on the class template type CacheType.
90  /// Simply changing that one template parameter CacheType thus allows to flexibly
91  /// switch between using float or double as type for caching the kernel values.
92  /// The default is float, offering sufficient accuracy in the vast majority
93  /// of cases, at a memory cost of only four bytes. However, the template
94  /// parameter makes it easy to use double instead, (e.g., in case high
95  /// accuracy training is needed).
96  typedef CacheType QpFloatType;
97
99
100  //! Constructor
101  //! \param kernel kernel function to use for training and prediction
102  //! \param C regularization parameter - always the 'true' value of C, even when unconstrained is set
103  //! \param offset whether to train the svm with offset term
104  //! \param unconstrained when a C-value is given via setParameter, should it be piped through the exp-function before using it in the solver?
105  CSvmTrainer(KernelType* kernel, double C, bool offset, bool unconstrained = false)
106  : base_type(kernel, C, offset, unconstrained), m_computeDerivative(false), m_McSvmType(McSvm::WW) //make Vapnik happy!
107  { }
108
109  //! Constructor
110  //! \param kernel kernel function to use for training and prediction
111  //! \param negativeC regularization parameter of the negative class (label 0)
112  //! \param positiveC regularization parameter of the positive class (label 1)
113  //! \param offset whether to train the svm with offset term
114  //! \param unconstrained when a C-value is given via setParameter, should it be piped through the exp-function before using it in the solver?
115  CSvmTrainer(KernelType* kernel, double negativeC, double positiveC, bool offset, bool unconstrained = false)
116  : base_type(kernel,negativeC, positiveC, offset, unconstrained), m_computeDerivative(false), m_McSvmType(McSvm::WW) //make Vapnik happy!
117  { }
118
119  /// \brief From INameable: return the class name.
120  std::string name() const
121  { return "CSvmTrainer"; }
122
123  void setComputeBinaryDerivative(bool compute){
124  m_computeDerivative = compute;
125  }
126
127  /// \brief sets the type of the multi-class svm used
128  void setMcSvmType(McSvm type){
129  m_McSvmType = type;
130  }
131
132
133  /// \brief Train the C-SVM.
134  void train(KernelClassifier<InputType>& svm, LabeledData<InputType, unsigned int> const& dataset)
135  {
136  std::size_t classes = numberOfClasses(dataset);
137  std::size_t ell = dataset.numberOfElements();
138  if(classes == 2){
139  // prepare model
140
141  auto& f = svm.decisionFunction();
142  if (f.basis() == dataset.inputs() && f.kernel() == base_type::m_kernel && f.alpha().size1() == ell && f.alpha().size2() == 1) {
143  // warm start, keep the alphas (possibly clipped)
144  if (this->m_trainOffset) f.offset() = RealVector(1);
145  }
146  else {
147  f.setStructure(base_type::m_kernel, dataset.inputs(), this->m_trainOffset);
148  }
149
150  //dispatch to use the optimal implementation and solve the problem
151  trainBinary(f,dataset);
152
153  if (base_type::sparsify())
154  f.sparsify();
155  return;
156  }
157
158  //special case OVA
159  if(m_McSvmType == McSvm::OVA){
160  trainOVA(svm,dataset);
161  return;
162  }
163
164  //general multiclass case: find correct dual formulation
165  bool sumToZero = false;
166  bool simplex = false;
169
170  switch (m_McSvmType){
171  case McSvm::WW:
172  sumToZero = false;
173  simplex = false;
174  setupMcParametersWWCS(nu,M, classes);
175  break;
176  case McSvm::CS:
177  sumToZero = false;
178  simplex=true;
179  setupMcParametersWWCS(nu,M, classes);
180  break;
181  case McSvm::LLW:
182  sumToZero=true;
183  simplex = false;
185  break;
186  case McSvm::ATM:
187  sumToZero=true;
188  simplex=true;
189  setupMcParametersATMATS(nu,M, classes);
190  break;
191  case McSvm::ATS:
192  sumToZero=true;
193  simplex = false;
194  setupMcParametersATMATS(nu,M, classes);
195  break;
197  sumToZero=true;
198  simplex=true;
200  break;
202  sumToZero = false;
203  simplex = false;
204  setupMcParametersATMATS(nu,M, classes);
205  break;
206  case McSvm::MMR:
207  sumToZero = true;
208  simplex = true;
209  setupMcParametersMMR(nu,M, classes);
210  break;
211  case McSvm::OVA: // handle OVA is switch statement to silence compiler warning
212  break;
213  }
214
215  //setup linear part
216  RealMatrix linear(ell,M.width(),1.0);
217  if(m_McSvmType == McSvm::ReinforcedSvm){
218  auto const& labels = dataset.labels();
219  std::size_t i=0;
220  for(unsigned int y: labels.elements()){
221  linear(i, y) = classes - 1.0; // self-margin target value of reinforced SVM loss
222  i++;
223  }
224  }
225
226  //solve dual
227  RealMatrix alpha(ell,M.width(),0.0);
228  RealVector bias(classes,0.0);
229  if(simplex)
230  solveMcSimplex(sumToZero,nu,M,linear,alpha,bias,dataset);
231  else
232  solveMcBox(sumToZero,nu,M,linear,alpha,bias,dataset);
233
234
235  // write the solution into the model
236  svm.decisionFunction().setStructure(this->m_kernel,dataset.inputs(),this->m_trainOffset,classes);
237
238  for (std::size_t i=0; i<ell; i++)
239  {
240  unsigned int y = dataset.element(i).label;
241  for (std::size_t c=0; c<classes; c++)
242  {
243  double sum = 0.0;
244  std::size_t r = alpha.size2() * y;
245  for (std::size_t p=0; p != alpha.size2(); p++, r++)
246  sum += nu(r, c) * alpha(i, p);
247  svm.decisionFunction().alpha(i,c) = sum;
248  }
249  }
250
251  if (this->m_trainOffset)
252  svm.decisionFunction().offset() = bias;
253
254  if (this->sparsify())
255  svm.decisionFunction().sparsify();
256  }
257
258  /// \brief Train the C-SVM using weights.
259  void train(KernelClassifier<InputType>& svm, WeightedLabeledData<InputType, unsigned int> const& dataset){
260  SHARK_RUNTIME_CHECK(numberOfClasses(dataset) == 2, "CSVM with weights is only implemented for binary problems");
261  // prepare model
262  std::size_t n = dataset.numberOfElements();
263  auto& f = svm.decisionFunction();
264  if (f.basis() == dataset.inputs() && f.kernel() == base_type::m_kernel && f.alpha().size1() == n && f.alpha().size2() == 1) {
265  // warm start, keep the alphas
266  if (this->m_trainOffset) f.offset() = RealVector(1);
267  else f.offset() = RealVector();
268  }
269  else {
270  f.setStructure(base_type::m_kernel, dataset.inputs(), this->m_trainOffset);
271  }
272
273  //dispatch to use the optimal implementation and solve the problem
274  trainBinary(f, dataset);
275
276  if (base_type::sparsify()) f.sparsify();
277  }
278
279  RealVector const& get_db_dParams()const{
280  return m_db_dParams;
281  }
282
283 private:
284
285  void solveMcSimplex(
286  bool sumToZero, QpSparseArray<QpFloatType> const& nu,QpSparseArray<QpFloatType> const& M, RealMatrix const& linear,
287  RealMatrix& alpha, RealVector& bias,
289  ){
290  typedef KernelMatrix<InputType, QpFloatType> KernelMatrixType;
291  typedef CachedMatrix< KernelMatrixType > CachedMatrixType;
292  typedef PrecomputedMatrix< KernelMatrixType > PrecomputedMatrixType;
293
294  KernelMatrixType km(*base_type::m_kernel, dataset.inputs());
295  // solve the problem
296  if (base_type::precomputeKernel())
297  {
298  PrecomputedMatrixType matrix(&km);
299  QpMcSimplexDecomp< PrecomputedMatrixType> problem(matrix, M, dataset.labels(), linear, this->C());
300  QpSolutionProperties& prop = base_type::m_solutionproperties;
301  problem.setShrinking(base_type::m_shrinking);
302  if(this->m_trainOffset){
303  BiasSolverSimplex<PrecomputedMatrixType> biasSolver(&problem);
304  biasSolver.solve(bias,base_type::m_stoppingcondition,nu,sumToZero, &prop);
305  }
306  else{
308  solver.solve( base_type::m_stoppingcondition, &prop);
309  }
310  alpha = problem.solution();
311  }
312  else
313  {
314  CachedMatrixType matrix(&km, base_type::m_cacheSize);
315  QpMcSimplexDecomp< CachedMatrixType> problem(matrix, M, dataset.labels(), linear, this->C());
316  QpSolutionProperties& prop = base_type::m_solutionproperties;
317  problem.setShrinking(base_type::m_shrinking);
318  if(this->m_trainOffset){
319  BiasSolverSimplex<CachedMatrixType> biasSolver(&problem);
320  biasSolver.solve(bias,base_type::m_stoppingcondition,nu,sumToZero, &prop);
321  }
322  else{
324  solver.solve( base_type::m_stoppingcondition, &prop);
325  }
326  alpha = problem.solution();
327  }
328  base_type::m_accessCount = km.getAccessCount();
329  }
330
331  void solveMcBox(
332  bool sumToZero, QpSparseArray<QpFloatType> const& nu,QpSparseArray<QpFloatType> const& M, RealMatrix const& linear,
333  RealMatrix& alpha, RealVector& bias,
335  ){
336  typedef KernelMatrix<InputType, QpFloatType> KernelMatrixType;
337  typedef CachedMatrix< KernelMatrixType > CachedMatrixType;
338  typedef PrecomputedMatrix< KernelMatrixType > PrecomputedMatrixType;
339
340  KernelMatrixType km(*base_type::m_kernel, dataset.inputs());
341  // solve the problem
342  if (base_type::precomputeKernel())
343  {
344  PrecomputedMatrixType matrix(&km);
345  QpMcBoxDecomp< PrecomputedMatrixType> problem(matrix, M, dataset.labels(), linear, this->C());
346  QpSolutionProperties& prop = base_type::m_solutionproperties;
347  problem.setShrinking(base_type::m_shrinking);
348  if(this->m_trainOffset){
349  BiasSolver<PrecomputedMatrixType> biasSolver(&problem);
350  biasSolver.solve(bias,base_type::m_stoppingcondition,nu, sumToZero, &prop);
351  }
352  else{
354  solver.solve( base_type::m_stoppingcondition, &prop);
355  }
356  alpha = problem.solution();
357  }
358  else
359  {
360  CachedMatrixType matrix(&km, base_type::m_cacheSize);
361  QpMcBoxDecomp< CachedMatrixType> problem(matrix, M, dataset.labels(), linear, this->C());
362  QpSolutionProperties& prop = base_type::m_solutionproperties;
363  problem.setShrinking(base_type::m_shrinking);
364  if(this->m_trainOffset){
365  BiasSolver<CachedMatrixType> biasSolver(&problem);
366  biasSolver.solve(bias,base_type::m_stoppingcondition,nu, sumToZero, &prop);
367  }
368  else{
370  solver.solve( base_type::m_stoppingcondition, &prop);
371  }
372  alpha = problem.solution();
373  }
374  base_type::m_accessCount = km.getAccessCount();
375  }
376
377  template<class Trainer>
378  void trainMc(KernelClassifier<InputType>& svm, LabeledData<InputType, unsigned int> const& dataset){
379  Trainer trainer(base_type::m_kernel,this->C(),this->m_trainOffset);
380  trainer.stoppingCondition() = this->stoppingCondition();
381  trainer.precomputeKernel() = this->precomputeKernel();
382  trainer.sparsify() = this->sparsify();
383  trainer.shrinking() = this->shrinking();
384  trainer.s2do() = this->s2do();
385  trainer.verbosity() = this->verbosity();
386  trainer.setCacheSize(this->cacheSize());
387  trainer.train(svm,dataset);
388  this->solutionProperties() = trainer.solutionProperties();
389  base_type::m_accessCount = trainer.accessCount();
390  }
391
392  void setupMcParametersWWCS(QpSparseArray<QpFloatType>& nu,QpSparseArray<QpFloatType>& M, std::size_t classes)const{
393  nu.resize(classes * (classes-1), classes, 2*classes*(classes-1));
394  for (unsigned int r=0, y=0; y<classes; y++)
395  {
396  for (unsigned int p=0, pp=0; p<classes-1; p++, pp++, r++)
397  {
398  if (pp == y) pp++;
399  if (y < pp)
400  {
403  }
404  else
405  {
408  }
409  }
410  }
411
412  M.resize(classes * (classes-1) * classes, classes-1, 2 * classes * (classes-1) * (classes-1));
413  for (unsigned int r=0, yv=0; yv<classes; yv++)
414  {
415  for (unsigned int pv=0, ppv=0; pv<classes-1; pv++, ppv++)
416  {
417  if (ppv == yv) ppv++;
418  for (unsigned int yw=0; yw<classes; yw++, r++)
419  {
420  QpFloatType baseM = (yv == yw ? (QpFloatType)0.25 : (QpFloatType)0.0) - (ppv == yw ? (QpFloatType)0.25 : (QpFloatType)0.0);
421  M.setDefaultValue(r, baseM);
422  if (yv == yw)
423  {
424  M.add(r, ppv - (ppv >= yw ? 1 : 0), baseM + (QpFloatType)0.25);
425  }
426  else if (ppv == yw)
427  {
428  M.add(r, yv - (yv >= yw ? 1 : 0), baseM - (QpFloatType)0.25);
429  }
430  else
431  {
432  unsigned int pw = ppv - (ppv >= yw ? 1 : 0);
433  unsigned int pw2 = yv - (yv >= yw ? 1 : 0);
434  if (pw < pw2)
435  {
436  M.add(r, pw, baseM + (QpFloatType)0.25);
437  M.add(r, pw2, baseM - (QpFloatType)0.25);
438  }
439  else
440  {
441  M.add(r, pw2, baseM - (QpFloatType)0.25);
442  M.add(r, pw, baseM + (QpFloatType)0.25);
443  }
444  }
445  }
446  }
447  }
448  }
449
450  void setupMcParametersATMATS(QpSparseArray<QpFloatType>& nu,QpSparseArray<QpFloatType>& M, std::size_t classes)const{
451  nu.resize(classes*classes, classes, classes*classes);
452  for (unsigned int r=0, y=0; y<classes; y++)
453  {
454  for (unsigned int p=0; p<classes; p++, r++)
455  {
456  nu.add(r, p, (QpFloatType)((p == y) ? 1.0 : -1.0));
457  }
458  }
459
460  M.resize(classes * classes * classes, classes, 2 * classes * classes * classes);
461  QpFloatType c_ne = (QpFloatType)(-1.0 / (double)classes);
462  QpFloatType c_eq = (QpFloatType)1.0 + c_ne;
463  for (unsigned int r=0, yv=0; yv<classes; yv++)
464  {
465  for (unsigned int pv=0; pv<classes; pv++)
466  {
467  QpFloatType sign = QpFloatType((yv == pv) ? -1 : 1);//cast to keep MSVC happy...
468  for (unsigned int yw=0; yw<classes; yw++, r++)
469  {
470  M.setDefaultValue(r, sign * c_ne);
471  if (yw == pv)
472  {
473  M.add(r, pv, -sign * c_eq);
474  }
475  else
476  {
477  M.add(r, pv, sign * c_eq);
478  M.add(r, yw, -sign * c_ne);
479  }
480  }
481  }
482  }
483  }
484
485  void setupMcParametersADMLLW(QpSparseArray<QpFloatType>& nu,QpSparseArray<QpFloatType>& M, std::size_t classes)const{
486  nu.resize(classes * (classes-1), classes, classes*(classes-1));
487  for (unsigned int r=0, y=0; y<classes; y++)
488  {
489  for (unsigned int p=0, pp=0; p<classes-1; p++, pp++, r++)
490  {
491  if (pp == y) pp++;
493  }
494  }
495
496  M.resize(classes * (classes-1) * classes, classes-1, classes * (classes-1) * (classes-1));
497  QpFloatType mood = (QpFloatType)(-1.0 / (double)classes);
498  QpFloatType val = (QpFloatType)1.0 + mood;
499  for (unsigned int r=0, yv=0; yv<classes; yv++)
500  {
501  for (unsigned int pv=0, ppv=0; pv<classes-1; pv++, ppv++)
502  {
503  if (ppv == yv) ppv++;
504  for (unsigned int yw=0; yw<classes; yw++, r++)
505  {
506  M.setDefaultValue(r, mood);
507  if (ppv != yw)
508  {
509  unsigned int pw = ppv - (ppv > yw ? 1 : 0);
511  }
512  }
513  }
514  }
515  }
516
517  void setupMcParametersMMR(QpSparseArray<QpFloatType>& nu,QpSparseArray<QpFloatType>& M, std::size_t classes)const{
518  nu.resize(classes, classes, classes);
519  for (unsigned int y=0; y<classes; y++)
521
522  M.resize(classes * classes, 1, classes);
523  QpFloatType mood = (QpFloatType)(-1.0 / (double)classes);
524  QpFloatType val = (QpFloatType)1.0 + mood;
525  for (unsigned int r=0, yv=0; yv<classes; yv++)
526  {
527  for (unsigned int yw=0; yw<classes; yw++, r++)
528  {
529  M.setDefaultValue(r, mood);
530  if (yv == yw) M.add(r, 0, val);
531  }
532  }
533  }
534
535  void trainOVA(KernelClassifier<InputType>& svm, const LabeledData<InputType, unsigned int>& dataset){
536  std::size_t classes = numberOfClasses(dataset);
537  svm.decisionFunction().setStructure(this->m_kernel,dataset.inputs(),this->m_trainOffset,classes);
538
539  base_type::m_solutionproperties.type = QpNone;
540  base_type::m_solutionproperties.accuracy = 0.0;
541  base_type::m_solutionproperties.iterations = 0;
542  base_type::m_solutionproperties.value = 0.0;
543  base_type::m_solutionproperties.seconds = 0.0;
544  for (unsigned int c=0; c<classes; c++)
545  {
547  KernelClassifier<InputType> binsvm;
548 // TODO: maybe build the Quadratic programs directly,
549 // in order to profit from cached and
550 // in particular from precomputed kernel
551 // entries!
552  CSvmTrainer<InputType, QpFloatType> bintrainer(base_type::m_kernel, this->C(),this->m_trainOffset);
553  bintrainer.setCacheSize(this->cacheSize());
554  bintrainer.sparsify() = false;
555  bintrainer.stoppingCondition() = base_type::stoppingCondition();
556  bintrainer.precomputeKernel() = base_type::precomputeKernel(); // sub-optimal!
557  bintrainer.shrinking() = base_type::shrinking();
558  bintrainer.s2do() = base_type::s2do();
559  bintrainer.verbosity() = base_type::verbosity();
560  bintrainer.train(binsvm, bindata);
561  base_type::m_solutionproperties.iterations += bintrainer.solutionProperties().iterations;
562  base_type::m_solutionproperties.seconds += bintrainer.solutionProperties().seconds;
563  base_type::m_solutionproperties.accuracy = std::max(base_type::solutionProperties().accuracy, bintrainer.solutionProperties().accuracy);
564  column(svm.decisionFunction().alpha(), c) = column(binsvm.decisionFunction().alpha(), 0);
565  if (this->m_trainOffset)
566  svm.decisionFunction().offset(c) = binsvm.decisionFunction().offset(0);
567  base_type::m_accessCount += bintrainer.accessCount();
568  }
569
570  if (base_type::sparsify())
571  svm.decisionFunction().sparsify();
572  }
573
574  //by default the normal unoptimized kernel matrix is used
575  template<class T, class DatasetTypeT>
576  void trainBinary(KernelExpansion<T>& svm, DatasetTypeT const& dataset){
577  KernelMatrix<T, QpFloatType> km(*base_type::m_kernel, dataset.inputs());
578  trainBinary(km,svm,dataset);
579  }
580
581  //in the case of a gaussian kernel and sparse vectors, we can use an optimized approach
582  template<class T, class DatasetTypeT>
583  void trainBinary(KernelExpansion<CompressedRealVector>& svm, DatasetTypeT const& dataset){
584  //check whether a gaussian kernel is used
586  Gaussian const* kernel = dynamic_cast<Gaussian const*> (base_type::m_kernel);
587  if(kernel != 0){//jep, use optimized kernel matrix
588  GaussianKernelMatrix<CompressedRealVector,QpFloatType> km(kernel->gamma(),dataset.inputs());
589  trainBinary(km,svm,dataset);
590  }
591  else{
592  KernelMatrix<CompressedRealVector, QpFloatType> km(*base_type::m_kernel, dataset.inputs());
593  trainBinary(km,svm,dataset);
594  }
595  }
596
597  //create the problem for the unweighted datasets
598  template<class Matrix, class T>
599  void trainBinary(Matrix& km, KernelExpansion<T>& svm, LabeledData<T, unsigned int> const& dataset){
601  {
602  PrecomputedMatrix<Matrix> matrix(&km);
603  CSVMProblem<PrecomputedMatrix<Matrix> > svmProblem(matrix,dataset.labels(),base_type::m_regularizers);
604  optimize(svm,svmProblem,dataset);
605  }
606  else
607  {
608  CachedMatrix<Matrix> matrix(&km);
609  CSVMProblem<CachedMatrix<Matrix> > svmProblem(matrix,dataset.labels(),base_type::m_regularizers);
610  optimize(svm,svmProblem,dataset);
611  }
612  base_type::m_accessCount = km.getAccessCount();
613  }
614
615  // create the problem for the weighted datasets
616  template<class Matrix, class T>
617  void trainBinary(Matrix& km, KernelExpansion<T>& svm, WeightedLabeledData<T, unsigned int> const& dataset){
619  {
620  PrecomputedMatrix<Matrix> matrix(&km);
622  matrix,dataset.labels(),dataset.weights(),base_type::m_regularizers
623  );
624  optimize(svm,svmProblem,dataset.data());
625  }
626  else
627  {
628  CachedMatrix<Matrix> matrix(&km);
630  matrix,dataset.labels(),dataset.weights(),base_type::m_regularizers
631  );
632  optimize(svm,svmProblem,dataset.data());
633  }
634  base_type::m_accessCount = km.getAccessCount();
635  }
636
637  template<class SVMProblemType>
638  void optimize(KernelExpansion<InputType>& svm, SVMProblemType& svmProblem, LabeledData<InputType, unsigned int> const& dataset){
639  if (this->m_trainOffset)
640  {
641  typedef SvmShrinkingProblem<SVMProblemType> ProblemType;
642  ProblemType problem(svmProblem,base_type::m_shrinking);
643  QpSolver< ProblemType > solver(problem);
644  // truncate the existing solution to the bounds
645  RealVector const& reg = this->regularizationParameters();
646  double C_minus = reg(0);
647  double C_plus = (reg.size() == 1) ? reg(0) : reg(1);
648  std::size_t i=0;
649  for (auto label : dataset.labels().elements()) {
650  double a = svm.alpha()(i, 0);
651  if (label == 0) a = std::max(std::min(a, 0.0), -C_minus);
652  else a = std::min(std::max(a, 0.0), C_plus);
653  svm.alpha()(i, 0) = a;
654  i++;
655  }
656  problem.setInitialSolution(blas::column(svm.alpha(), 0));
657  solver.solve(base_type::stoppingCondition(), &base_type::solutionProperties());
658  column(svm.alpha(),0)= problem.getUnpermutedAlpha();
659  svm.offset(0) = computeBias(problem,dataset);
660  }
661  else
662  {
664  ProblemType problem(svmProblem,base_type::m_shrinking);
665  QpSolver< ProblemType> solver(problem);
666  // truncate the existing solution to the bounds
667  RealVector const& reg = this->regularizationParameters();
668  double C_minus = reg(0);
669  double C_plus = (reg.size() == 1) ? reg(0) : reg(1);
670  std::size_t i=0;
671  for (auto label : dataset.labels().elements()) {
672  double a = svm.alpha()(i, 0);
673  if (label == 0) a = std::max(std::min(a, 0.0), -C_minus);
674  else a = std::min(std::max(a, 0.0), C_plus);
675  svm.alpha()(i, 0) = a;
676  i++;
677  }
678  problem.setInitialSolution(blas::column(svm.alpha(), 0));
679  solver.solve(base_type::stoppingCondition(), &base_type::solutionProperties());
680  column(svm.alpha(),0) = problem.getUnpermutedAlpha();
681  }
682  }
683
684  RealVector m_db_dParams; ///< in the rare case that there are only bounded SVs and no free SVs, this will hold the derivative of b w.r.t. the hyperparameters. Derivative w.r.t. C is last.
685
686  bool m_computeDerivative;
687  McSvm m_McSvmType;
688
689  template<class Problem>
690  double computeBias(Problem const& problem, LabeledData<InputType, unsigned int> const& dataset){
691  std::size_t nkp = base_type::m_kernel->numberOfParameters();
692  m_db_dParams.resize(nkp+1);
693  m_db_dParams.clear();
694
695  std::size_t ell = problem.dimensions();
696  if (ell == 0) return 0.0;
697
698  // compute the offset from the KKT conditions
699  double lowerBound = -1e100;
700  double upperBound = 1e100;
701  double sum = 0.0;
702  std::size_t freeVars = 0;
703  std::size_t lower_i = 0;
704  std::size_t upper_i = 0;
705  for (std::size_t i=0; i<ell; i++)
706  {
708  if (problem.alpha(i) == problem.boxMin(i))
709  {
710  if (value > lowerBound) { //in case of no free SVs, we are looking for the largest gradient of all alphas at the lower bound
711  lowerBound = value;
712  lower_i = i;
713  }
714  }
715  else if (problem.alpha(i) == problem.boxMax(i))
716  {
717  if (value < upperBound) { //in case of no free SVs, we are looking for the smallest gradient of all alphas at the upper bound
718  upperBound = value;
719  upper_i = i;
720  }
721  }
722  else
723  {
724  sum += value;
725  freeVars++;
726  }
727  }
728  if (freeVars > 0)
729  return sum / freeVars; //stabilized (averaged) exact value
730
731  if(!m_computeDerivative)
732  return 0.5 * (lowerBound + upperBound); //best estimate
733
734  lower_i = problem.permutation(lower_i);
735  upper_i = problem.permutation(upper_i);
736
737  SHARK_RUNTIME_CHECK(base_type::m_regularizers.size() == 1, "derivative only implemented for SVM with one C" );
738
739  // We next compute the derivative of lowerBound and upperBound wrt C, in order to then get that of b wrt C.
740  // The equation at the foundation of this simply is g_i = y_i - \sum_j \alpha_j K_{ij} .
741  double dlower_dC = 0.0;
742  double dupper_dC = 0.0;
743  // At the same time, we also compute the derivative of lowerBound and upperBound wrt the kernel parameters.
744  // The equation at the foundation of this simply is g_i = y_i - \sum_j \alpha_j K_{ij} .
745  RealVector dupper_dkernel( nkp,0 );
746  RealVector dlower_dkernel( nkp,0 );
747  //state for eval and evalDerivative of the kernel
748  boost::shared_ptr<State> kernelState = base_type::m_kernel->createState();
749  RealVector der(nkp ); //derivative storage helper
750  //todo: O.K.: here kernel single input derivative would be usefull
751  //also it can be usefull to use here real batch processing and use batches of size 1 for lower /upper
752  //and instead of singleInput whole batches.
753  //what we do is, that we use the batched input versions with batches of size one.
754  typename Batch<InputType>::type singleInput = Batch<InputType>::createBatch( dataset.element(0).input, 1 );
755  typename Batch<InputType>::type lowerInput = Batch<InputType>::createBatch( dataset.element(lower_i).input, 1 );
756  typename Batch<InputType>::type upperInput = Batch<InputType>::createBatch( dataset.element(upper_i).input, 1 );
757  getBatchElement( lowerInput, 0 ) = dataset.element(lower_i).input; //copy the current input into the batch
758  getBatchElement( upperInput, 0 ) = dataset.element(upper_i).input; //copy the current input into the batch
759  RealMatrix one(1,1,1); //weight of input
760  RealMatrix result(1,1); //stores the result of the call
761
762  for (std::size_t i=0; i<ell; i++) {
763  double cur_alpha = problem.alpha(problem.permutation(i));
764  if ( cur_alpha != 0 ) {
765  int cur_label = ( cur_alpha>0.0 ? 1 : -1 );
766  getBatchElement( singleInput, 0 ) = dataset.element(i).input; //copy the current input into the batch
767  // treat contributions of largest gradient at lower bound
768  base_type::m_kernel->eval( lowerInput, singleInput, result, *kernelState );
769  dlower_dC += cur_label * result(0,0);
770  base_type::m_kernel->weightedParameterDerivative( lowerInput, singleInput,one, *kernelState, der );
771  for ( std::size_t k=0; k<nkp; k++ ) {
772  dlower_dkernel(k) += cur_label * der(k);
773  }
774  // treat contributions of smallest gradient at upper bound
775  base_type::m_kernel->eval( upperInput, singleInput,result, *kernelState );
776  dupper_dC += cur_label * result(0,0);
777  base_type::m_kernel->weightedParameterDerivative( upperInput, singleInput, one, *kernelState, der );
778  for ( std::size_t k=0; k<nkp; k++ ) {
779  dupper_dkernel(k) += cur_label * der(k);
780  }
781  }
782  }
783  // assign final values to derivative of b wrt hyperparameters
784  m_db_dParams( nkp ) = -0.5 * ( dlower_dC + dupper_dC );
785  for ( std::size_t k=0; k<nkp; k++ ) {
786  m_db_dParams(k) = -0.5 * this->C() * ( dlower_dkernel(k) + dupper_dkernel(k) );
787  }
788  if ( base_type::m_unconstrained ) {
789  m_db_dParams( nkp ) *= this->C();
790  }
791
792  return 0.5 * (lowerBound + upperBound); //best estimate
793  }
794 };
795
796
797 template <class InputType>
799 {
800 public:
802
803  LinearCSvmTrainer(double C, bool offset, bool unconstrained = false)
804  : AbstractLinearSvmTrainer<InputType>(C, offset, unconstrained){}
805
806  /// \brief From INameable: return the class name.
807  std::string name() const
808  { return "LinearCSvmTrainer"; }
809
810  /// \brief sets the type of the multi-class svm used
811  void setMcSvmType(McSvm type){
812  m_McSvmType = type;
813  }
814
816  {
817  std::size_t classes = numberOfClasses(dataset);
818  if(classes == 2){
819  trainBinary(model,dataset);
820  return;
821  }
822  switch (m_McSvmType){
823  case McSvm::WW:
824  trainMc<QpMcLinearWW<InputType> >(model,dataset,classes);
825  break;
826  case McSvm::CS:
827  trainMc<QpMcLinearCS<InputType> >(model,dataset,classes);
828  break;
829  case McSvm::LLW:
830  trainMc<QpMcLinearLLW<InputType> >(model,dataset,classes);
831  break;
832  case McSvm::ATM:
833  trainMc<QpMcLinearATM<InputType> >(model,dataset,classes);
834  break;
835  case McSvm::ATS:
836  trainMc<QpMcLinearATS<InputType> >(model,dataset,classes);
837  break;
840  break;
841  case McSvm::MMR:
842  trainMc<QpMcLinearMMR<InputType> >(model,dataset,classes);
843  break;
845  trainMc<QpMcLinearReinforced<InputType> >(model,dataset,classes);
846  break;
847  case McSvm::OVA://OVA is a special case and implemented here
848  trainOVA(model,dataset,classes);
849  break;
850  }
851  }
852 private:
853  McSvm m_McSvmType;
854
855  void trainBinary(LinearClassifier<InputType>& model, LabeledData<InputType, unsigned int> const& dataset)
856  {
857  std::size_t dim = inputDimension(dataset);
858  QpBoxLinear<InputType> solver(dataset, dim);
859  solver.solve(
860  base_type::C(),
861  0.0,
864  QpConfig::verbosity() > 0);
865
866  if(!this->trainOffset()){
867  RealMatrix w(1, dim, 0.0);
868  row(w,0) = solver.solutionWeightVector();
869  model.decisionFunction().setStructure(w);
870  return;
871  }
872
873  double offset = 0;
874  double stepSize = 0.1;
876  while(stepSize > 0.1*QpConfig::stoppingCondition().minAccuracy){
877  offset+= (grad < 0? -stepSize:stepSize);
878  solver.setOffset(offset);
879  solver.solve(
880  base_type::C(),
881  0.0,
884  QpConfig::verbosity() > 0);
887  break;
889  stepSize *= 0.5;
890  else
891  stepSize *= 1.6;
893  }
894
895  RealMatrix w(1, dim, 0.0);
896  noalias(row(w,0)) = solver.solutionWeightVector();
897  model.decisionFunction().setStructure(w,RealVector(1,offset));
898
899  }
900  template<class Solver>
901  void trainMc(LinearClassifier<InputType>& model, LabeledData<InputType, unsigned int> const& dataset, std::size_t classes){
902  std::size_t dim = inputDimension(dataset);
903
904  Solver solver(dataset, dim, classes);
905  RealMatrix w = solver.solve(random::globalRng, this->C(), this->stoppingCondition(), &this->solutionProperties(), this->verbosity() > 0);
906  model.decisionFunction().setStructure(w);
907  }
908
909  void trainOVA(LinearClassifier<InputType>& model, const LabeledData<InputType, unsigned int>& dataset, std::size_t classes)
910  {
911  base_type::m_solutionproperties.type = QpNone;
912  base_type::m_solutionproperties.accuracy = 0.0;
913  base_type::m_solutionproperties.iterations = 0;
914  base_type::m_solutionproperties.value = 0.0;
915  base_type::m_solutionproperties.seconds = 0.0;
916
917  std::size_t dim = inputDimension(dataset);
918  RealMatrix w(classes, dim);
919  for (unsigned int c=0; c<classes; c++)
920  {
922  QpBoxLinear<InputType> solver(bindata, dim);
924  solver.solve(this->C(), 0.0, base_type::m_stoppingcondition, &prop, base_type::m_verbosity > 0);
925  noalias(row(w, c)) = solver.solutionWeightVector();
926  base_type::m_solutionproperties.iterations += prop.iterations;
927  base_type::m_solutionproperties.seconds += prop.seconds;
928  base_type::m_solutionproperties.accuracy = std::max(base_type::solutionProperties().accuracy, prop.accuracy);
929  }
930  model.decisionFunction().setStructure(w);
931  }
932 };
933
934
935 template <class InputType, class CacheType = float>
936 class SquaredHingeCSvmTrainer : public AbstractSvmTrainer<InputType, unsigned int>
937 {
938 public:
939  typedef CacheType QpFloatType;
940
944
948
949  //! Constructor
950  //! \param kernel kernel function to use for training and prediction
951  //! \param C regularization parameter - always the 'true' value of C, even when unconstrained is set
952  //! \param unconstrained when a C-value is given via setParameter, should it be piped through the exp-function before using it in the solver??
953  SquaredHingeCSvmTrainer(KernelType* kernel, double C, bool unconstrained = false)
954  : base_type(kernel, C, unconstrained)
955  { }
956
957  //! Constructor
958  //! \param kernel kernel function to use for training and prediction
959  //! \param negativeC regularization parameter of the negative class (label 0)
960  //! \param positiveC regularization parameter of the positive class (label 1)
961  //! \param unconstrained when a C-value is given via setParameter, should it be piped through the exp-function before using it in the solver?
962  SquaredHingeCSvmTrainer(KernelType* kernel, double negativeC, double positiveC, bool unconstrained = false)
963  : base_type(kernel,negativeC, positiveC, unconstrained)
964  { }
965
966  /// \brief From INameable: return the class name.
967  std::string name() const
968  { return "SquaredHingeCSvmTrainer"; }
969
970  /// \brief Train the C-SVM.
972  {
973  svm.decisionFunction().setStructure(base_type::m_kernel,dataset.inputs(),this->m_trainOffset);
974
975  RealVector diagonalModifier(dataset.numberOfElements(),0.5/base_type::m_regularizers(0));
976  if(base_type::m_regularizers.size() != 1){
977  for(std::size_t i = 0; i != diagonalModifier.size();++i){
978  diagonalModifier(i) = 0.5/base_type::m_regularizers(dataset.element(i).label);
979  }
980  }
981
982  KernelMatrixType km(*base_type::m_kernel, dataset.inputs(),diagonalModifier);
984  {
985  PrecomputedMatrixType matrix(&km);
986  optimize(svm.decisionFunction(),matrix,diagonalModifier,dataset);
987  }
988  else
989  {
990  CachedMatrixType matrix(&km);
991  optimize(svm.decisionFunction(),matrix,diagonalModifier,dataset);
992  }
993  base_type::m_accessCount = km.getAccessCount();
994  if (base_type::sparsify()) svm.decisionFunction().sparsify();
995
996  }
997
998 private:
999
1000  template<class Matrix>
1001  void optimize(KernelExpansion<InputType>& svm, Matrix& matrix,RealVector const& diagonalModifier, LabeledData<InputType, unsigned int> const& dataset){
1002  typedef CSVMProblem<Matrix> SVMProblemType;
1003  SVMProblemType svmProblem(matrix,dataset.labels(),1e100);
1004  if (this->m_trainOffset)
1005  {
1006  typedef SvmShrinkingProblem<SVMProblemType> ProblemType;
1007  ProblemType problem(svmProblem,base_type::m_shrinking);
1008  QpSolver< ProblemType > solver(problem);
1009  solver.solve(base_type::stoppingCondition(), &base_type::solutionProperties());
1010  column(svm.alpha(),0)= problem.getUnpermutedAlpha();
1011  //compute the bias
1012  double sum = 0.0;
1013  std::size_t freeVars = 0;
1014  for (std::size_t i=0; i < problem.dimensions(); i++)
1015  {
1016  if(problem.alpha(i) > problem.boxMin(i) && problem.alpha(i) < problem.boxMax(i)){
1017  sum += problem.gradient(i) - problem.alpha(i)*2*diagonalModifier(i);
1018  freeVars++;
1019  }
1020  }
1021  if (freeVars > 0)
1022  svm.offset(0) = sum / freeVars; //stabilized (averaged) exact value
1023  else
1024  svm.offset(0) = 0;
1025  }
1026  else
1027  {
1029  ProblemType problem(svmProblem,base_type::m_shrinking);
1030  QpSolver< ProblemType > solver(problem);
1031  solver.solve(base_type::stoppingCondition(), &base_type::solutionProperties());
1032  column(svm.alpha(),0) = problem.getUnpermutedAlpha();
1033
1034  }
1035  }
1036 };
1037
1038
1039 template <class InputType>
1041 {
1042 public:
1044
1045  SquaredHingeLinearCSvmTrainer(double C, bool unconstrained = false)
1046  : AbstractLinearSvmTrainer<InputType>(C, false, unconstrained){}
1047
1048  /// \brief From INameable: return the class name.
1049  std::string name() const
1050  { return "SquaredHingeLinearCSvmTrainer"; }
1051
1053  {
1054  std::size_t dim = inputDimension(dataset);
1055  QpBoxLinear<InputType> solver(dataset, dim);
1056  RealMatrix w(1, dim, 0.0);
1057  solver.solve(
1058  1e100,
1059  1.0 / base_type::C(),
1062  QpConfig::verbosity() > 0);
1063  row(w,0) = solver.solutionWeightVector();
1064  model.decisionFunction().setStructure(w);
1065  }
1066 };
1067
1068
1069 }
1070 #endif