QpMcLinear.h
//===========================================================================
/*!
 *
 *
 * \brief Quadratic programming solvers for linear multi-class SVM training without bias.
 *
 *
 *
 * \author T. Glasmachers
 * \date -
 *
 *
 * \par Copyright 1995-2017 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <http://shark-ml.org/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================


#ifndef SHARK_ALGORITHMS_QP_QPMCLINEAR_H
#define SHARK_ALGORITHMS_QP_QPMCLINEAR_H

#include <shark/Core/Timer.h>
#include <shark/Algorithms/QP/QuadraticProgram.h> // QpStoppingCondition, QpSolutionProperties
#include <shark/Data/Dataset.h>
#include <shark/Data/DataView.h>
#include <shark/LinAlg/Base.h>
#include <algorithm> // std::shuffle
#include <cmath>
#include <iostream>
#include <vector>


namespace shark {


/// \brief Generic solver skeleton for linear multi-class SVM problems.
template <class InputT>
class QpMcLinear
{
public:
    typedef LabeledData<InputT, unsigned int> DatasetType;
    typedef typename Batch<InputT>::const_reference InputReferenceType;

    enum {UNIFORM, ACF}; // coordinate selection strategies


    ///
    /// \brief Constructor
    ///
    /// \param dataset training data
    /// \param dim problem dimension
    /// \param classes number of classes in the problem
    /// \param strategy coordinate selection strategy
    /// \param shrinking flag turning shrinking on and off
    ///
    QpMcLinear(
            const DatasetType& dataset,
            std::size_t dim,
            std::size_t classes,
            std::size_t strategy = ACF,
            bool shrinking = false)
    : m_data(dataset)
    , m_xSquared(dataset.numberOfElements())
    , m_dim(dim)
    , m_classes(classes)
    , m_strategy(strategy)
    , m_shrinking(shrinking)
    {
        SHARK_ASSERT(m_dim > 0);

        // precompute the squared norm of every training example
        for (std::size_t i=0; i<m_data.size(); i++)
        {
            m_xSquared(i) = inner_prod(m_data[i].input, m_data[i].input);
        }
    }

    ///
    /// \brief Solve the SVM training problem.
    ///
    /// \param rng random number generator used by the algorithm
    /// \param C regularization constant of the SVM
    /// \param stop stopping condition(s)
    /// \param prop solution properties
    /// \param verbose if true, the solver prints status information and solution statistics
    ///
    RealMatrix solve(
            random::rng_type& rng,
            double C,
            QpStoppingCondition& stop,
            QpSolutionProperties* prop = NULL,
            bool verbose = false)
    {
        // sanity checks
        SHARK_ASSERT(C > 0.0);

        // measure training time
        Timer timer;
        timer.start();

        // prepare dimensions and vectors
        std::size_t ell = m_data.size();           // number of training examples
        RealMatrix alpha(ell, m_classes + 1, 0.0); // Lagrange multipliers (dual variables); one extra column is reserved for the sum constraint used by the CS, ADM, and ATM solvers
        RealMatrix w(m_classes, m_dim, 0.0);       // weight vectors (primal variables)

        // scheduling of steps, for ACF only
        RealVector pref(ell, 1.0);    // example-wise measure of success
        double prefsum = (double)ell; // normalization constant

        std::vector<std::size_t> schedule(ell);
        if (m_strategy == UNIFORM)
        {
            for (std::size_t i=0; i<ell; i++) schedule[i] = i;
        }

        // used for shrinking
        std::size_t active = ell;

        // prepare counters
        std::size_t epoch = 0;
        std::size_t steps = 0;

        // prepare performance monitoring
        double objective = 0.0;
        double max_violation = 0.0;

        // gain for ACF
        const double gain_learning_rate = 1.0 / ell;
        double average_gain = 0.0;


        // outer optimization loop (epochs)
        bool canstop = true;
        while (true)
        {
            if (m_strategy == ACF)
            {
                // define the schedule: each example receives a number of
                // slots proportional to its preference
                double psum = prefsum;
                prefsum = 0.0;
                std::size_t pos = 0;
                for (std::size_t i=0; i<ell; i++)
                {
                    double p = pref(i);
                    double num = (psum < 1e-6) ? ell - pos : std::min((double)(ell - pos), (ell - pos) * p / psum);
                    std::size_t n = (std::size_t)std::floor(num);
                    double prob = num - n;
                    if (random::uni(rng) < prob) n++;
                    for (std::size_t j=0; j<n; j++)
                    {
                        schedule[pos] = i;
                        pos++;
                    }
                    psum -= p;
                    prefsum += p;
                }
                SHARK_ASSERT(pos == ell);
            }

            if (m_shrinking == true)
            {
                std::shuffle(schedule.begin(), schedule.begin() + active, rng);
            }
            else
            {
                std::shuffle(schedule.begin(), schedule.end(), rng);
            }

            // inner loop (one epoch)
            max_violation = 0.0;
            std::size_t nPoints = ell;
            if (m_shrinking == true)
                nPoints = active;

            for (std::size_t j=0; j<nPoints; j++)
            {
                // active example
                double gain = 0.0;
                const std::size_t i = schedule[j];
                InputReferenceType x_i = m_data[i].input;
                const unsigned int y_i = m_data[i].label;
                const double q = m_xSquared(i);
                blas::matrix_row<RealMatrix> a = row(alpha, i);

                // compute gradient and KKT violation
                RealVector wx = prod(w, x_i);
                RealVector g(m_classes);
                double kkt = calcGradient(g, wx, a, C, y_i);

                if (kkt > 0.0)
                {
                    max_violation = std::max(max_violation, kkt);

                    // perform the step on alpha
                    RealVector mu(m_classes, 0.0);
                    gain = solveSub(0.1 * stop.minAccuracy, g, q, C, y_i, a, mu);
                    objective += gain;
                    steps++;

                    // update weight vectors
                    updateWeightVectors(w, mu, i);
                }
                else if (m_shrinking == true)
                {
                    active--;
                    std::swap(schedule[j], schedule[active]);
                    j--;
                }

                // update gain-based preferences
                if (m_strategy == ACF)
                {
                    if (epoch == 0) average_gain += gain / (double)ell;
                    else
                    {
                        // strategy constants
                        constexpr double CHANGE_RATE = 0.2;
                        constexpr double PREF_MIN = 0.05;
                        constexpr double PREF_MAX = 20.0;

                        double change = CHANGE_RATE * (gain / average_gain - 1.0);
                        double newpref = std::min(PREF_MAX, std::max(PREF_MIN, pref(i) * std::exp(change)));
                        prefsum += newpref - pref(i);
                        pref(i) = newpref;
                        average_gain = (1.0 - gain_learning_rate) * average_gain + gain_learning_rate * gain;
                    }
                }
            }

            epoch++;

            // stopping criteria
            if (stop.maxIterations > 0 && epoch * ell >= stop.maxIterations)
            {
                if (prop != NULL) prop->type = QpMaxIterationsReached;
                break;
            }

            if (timer.stop() >= stop.maxSeconds)
            {
                if (prop != NULL) prop->type = QpTimeout;
                break;
            }

            if (max_violation < stop.minAccuracy)
            {
                if (verbose)
                    std::cout << "#" << std::flush;
                if (canstop)
                {
                    if (prop != NULL) prop->type = QpAccuracyReached;
                    break;
                }
                else
                {
                    if (m_strategy == ACF)
                    {
                        // prepare a full sweep for a reliable check of the stopping criterion
                        canstop = true;
                        for (std::size_t i=0; i<ell; i++) pref(i) = 1.0;
                        prefsum = (double)ell;
                    }

                    if (m_shrinking == true)
                    {
                        // prepare a full sweep for a reliable check of the stopping criterion
                        active = ell;
                        canstop = true;
                    }
                }
            }
            else
            {
                if (verbose) std::cout << "." << std::flush;
                if (m_strategy == ACF)
                    canstop = false;
                if (m_shrinking == true)
                    canstop = (active == ell);
            }
        }
        timer.stop();

        // calculate the dual objective value
        objective = 0.0;
        for (std::size_t j=0; j<m_classes; j++)
        {
            for (std::size_t d=0; d<m_dim; d++) objective -= w(j, d) * w(j, d);
        }
        objective *= 0.5;
        for (std::size_t i=0; i<ell; i++)
        {
            for (std::size_t j=0; j<m_classes; j++) objective += alpha(i, j);
        }

        // return solution statistics
        if (prop != NULL)
        {
            prop->accuracy = max_violation; // this is approximate, but a good guess
            prop->iterations = ell * epoch;
            prop->value = objective;
            prop->seconds = timer.lastLap();
        }

        // output solution statistics
        if (verbose)
        {
            std::cout << std::endl;
            std::cout << "training time (seconds): " << timer.lastLap() << std::endl;
            std::cout << "number of epochs: " << epoch << std::endl;
            std::cout << "number of iterations: " << (ell * epoch) << std::endl;
            std::cout << "number of non-zero steps: " << steps << std::endl;
            std::cout << "dual accuracy: " << max_violation << std::endl;
            std::cout << "dual objective value: " << objective << std::endl;
        }

        // return the solution
        return w;
    }

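    // Illustrative sketch (not part of the original header): how the ACF
    // schedule above distributes one epoch's worth of steps. Suppose
    // ell = 4 examples with preferences pref = (2, 1, 0.5, 0.5), so
    // prefsum = 4. Example 0 receives min(4, 4 * 2/4) = 2 slots, example 1
    // receives 2 * 1/2 = 1 slot, example 2 receives a slot with
    // probability 0.5 (the random::uni() coin flip on the fractional
    // part), and the last example absorbs whatever remains, so exactly
    // ell slots are always filled (hence the assertion pos == ell).
    // Preferences then rise or fall with the gain of each step (see the
    // CHANGE_RATE update above), so successful examples are visited more
    // often in later epochs.
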
protected:
    // for all c: row(w, c) += mu(c) * x
    void add_scaled(RealMatrix& w, RealVector const& mu, InputReferenceType x)
    {
        for (std::size_t c=0; c<m_classes; c++) noalias(row(w, c)) += mu(c) * x;
    }

    /// \brief Compute the gradient from the inner products of the weight vectors with the current sample.
    ///
    /// \param gradient gradient vector to be filled in; the vector is correctly sized
    /// \param wx inner products of weight vectors with the current sample; wx(c) = <w_c, x>
    /// \param alpha variables corresponding to the current sample
    /// \param C upper bound on the variables
    /// \param y label of the current sample
    ///
    /// \return The violation of the KKT conditions.
    virtual double calcGradient(RealVector& gradient, RealVector wx, blas::matrix_row<RealMatrix> const& alpha, double C, unsigned int y) = 0;

    /// \brief Update the weight vectors (primal variables) after a step on the dual variables.
    ///
    /// \param w matrix of (dense) weight vectors (as rows)
    /// \param mu dual step on the variables corresponding to the current sample
    /// \param index current sample
    virtual void updateWeightVectors(RealMatrix& w, RealVector const& mu, std::size_t index) = 0;

    /// \brief Solve the sub-problem posed by a single training example.
    ///
    /// \param epsilon accuracy (dual gradient) up to which the sub-problem should be solved
    /// \param gradient gradient of the objective function w.r.t. alpha
    /// \param q squared norm of the current sample
    /// \param C upper bound on the variables
    /// \param y label of the current sample
    /// \param alpha input: initial point; output: (near) optimal point
    /// \param mu step from the initial point to the final point
    ///
    /// \return The gain of the step, i.e., the improvement of the objective function.
    virtual double solveSub(double epsilon, RealVector gradient, double q, double C, unsigned int y, blas::matrix_row<RealMatrix>& alpha, RealVector& mu) = 0;

    DataView<const DatasetType> m_data; ///< view on the training data
    RealVector m_xSquared;              ///< diagonal entries of the quadratic matrix
    std::size_t m_dim;                  ///< input space dimension
    std::size_t m_classes;              ///< number of classes
    std::size_t m_strategy;             ///< strategy for coordinate selection
    bool m_shrinking;                   ///< whether to apply shrinking
};

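// Illustrative usage sketch (not part of the original header). A minimal
// example of training one of the concrete solvers below; the dataset setup
// is schematic, and the helper calls inputDimension() / numberOfClasses()
// are assumptions about the surrounding Shark data API.
//
//     LabeledData<RealVector, unsigned int> data = ...; // training set
//     std::size_t dim = inputDimension(data);
//     std::size_t classes = numberOfClasses(data);
//     QpMcLinearWW<RealVector> solver(data, dim, classes);
//
//     random::rng_type rng(42);
//     QpStoppingCondition stop;
//     stop.minAccuracy = 0.001; // target KKT violation
//     QpSolutionProperties prop;
//     RealMatrix w = solver.solve(rng, 1.0 /*C*/, stop, &prop, true);
//     // row(w, c) is the weight vector of class c;
//     // the predicted label of x is argmax_c <w_c, x>
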
/// \brief Solver for the multi-class SVM by Weston & Watkins.
template <class InputT>
class QpMcLinearWW : public QpMcLinear<InputT>
{
public:
    typedef typename QpMcLinear<InputT>::DatasetType DatasetType;

    /// \brief Constructor
    QpMcLinearWW(
            const DatasetType& dataset,
            std::size_t dim,
            std::size_t classes)
    : QpMcLinear<InputT>(dataset, dim, classes)
    { }

protected:
    /// \brief Compute the gradient from the inner products of the weight vectors with the current sample.
    virtual double calcGradient(RealVector& gradient, RealVector wx, blas::matrix_row<RealMatrix> const& alpha, double C, unsigned int y)
    {
        double violation = 0.0;
        for (std::size_t c=0; c<wx.size(); c++)
        {
            if (c == y)
            {
                gradient(c) = 0.0;
            }
            else
            {
                const double g = 1.0 - 0.5 * (wx(y) - wx(c));
                gradient(c) = g;
                if (g > violation && alpha(c) < C) violation = g;
                else if (-g > violation && alpha(c) > 0.0) violation = -g;
            }
        }
        return violation;
    }

    /// \brief Update the weight vectors (primal variables) after a step on the dual variables.
    virtual void updateWeightVectors(RealMatrix& w, RealVector const& mu, std::size_t index)
    {
        double sum_mu = 0.0;
        for (std::size_t c=0; c<m_classes; c++) sum_mu += mu(c);
        unsigned int y = m_data[index].label;
        RealVector step(-0.5 * mu);
        step(y) = 0.5 * sum_mu;
        add_scaled(w, step, m_data[index].input);
    }

    /// \brief Solve the sub-problem posed by a single training example.
    virtual double solveSub(double epsilon, RealVector gradient, double q, double C, unsigned int y, blas::matrix_row<RealMatrix>& alpha, RealVector& mu)
    {
        const double qq = 0.5 * q;
        double gain = 0.0;

        // SMO loop
        std::size_t iter, maxiter = 10 * m_classes;
        for (iter=0; iter<maxiter; iter++)
        {
            // select working set
            std::size_t idx = 0;
            double kkt = 0.0;
            for (std::size_t c=0; c<m_classes; c++)
            {
                if (c == y) continue;

                const double g = gradient(c);
                const double a = alpha(c);
                if (g > kkt && a < C) { kkt = g; idx = c; }
                else if (-g > kkt && a > 0.0) { kkt = -g; idx = c; }
            }

            // check stopping criterion
            if (kkt < epsilon) break;

            // perform step
            const double a = alpha(idx);
            const double g = gradient(idx);
            double m = g / qq;
            double a_new = a + m;
            if (a_new <= 0.0)
            {
                m = -a;
                a_new = 0.0;
            }
            else if (a_new >= C)
            {
                m = C - a;
                a_new = C;
            }
            alpha(idx) = a_new;
            mu(idx) += m;

            // update gradient and total gain
            const double dg = 0.5 * m * qq;
            for (std::size_t c=0; c<m_classes; c++) gradient(c) -= dg;
            gradient(idx) -= dg;

            gain += m * (g - dg);
        }

        return gain;
    }

protected:
    using QpMcLinear<InputT>::add_scaled;
    using QpMcLinear<InputT>::m_data;
    using QpMcLinear<InputT>::m_classes;
};
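
// Illustrative sketch (not part of the original header): the box-constrained
// Newton step used by solveSub() above and by the solvers below. For a
// single coordinate with gradient g, curvature qq, and box [0, C], the
// unconstrained step m = g / qq is clipped so that alpha stays feasible.
// For example, with alpha = 0.8, g = 1.2, qq = 2.0, and C = 1.0, the
// unconstrained step m = 0.6 would move alpha to 1.4 > C, so it is clipped
// to m = C - alpha = 0.2 and alpha is set to C. The gain of the (possibly
// clipped) step is m * (g - 0.5 * m * qq), the exact change of the dual
// objective along that coordinate.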


/// \brief Solver for the multi-class SVM by Lee, Lin & Wahba.
template <class InputT>
class QpMcLinearLLW : public QpMcLinear<InputT>
{
public:
    typedef typename QpMcLinear<InputT>::DatasetType DatasetType;

    /// \brief Constructor
    QpMcLinearLLW(
            const DatasetType& dataset,
            std::size_t dim,
            std::size_t classes)
    : QpMcLinear<InputT>(dataset, dim, classes)
    { }

protected:
    /// \brief Compute the gradient from the inner products of the weight vectors with the current sample.
    virtual double calcGradient(RealVector& gradient, RealVector wx, blas::matrix_row<RealMatrix> const& alpha, double C, unsigned int y)
    {
        double violation = 0.0;
        for (std::size_t c=0; c<m_classes; c++)
        {
            if (c == y)
            {
                gradient(c) = 0.0;
            }
            else
            {
                const double g = 1.0 + wx(c);
                gradient(c) = g;
                if (g > violation && alpha(c) < C) violation = g;
                else if (-g > violation && alpha(c) > 0.0) violation = -g;
            }
        }
        return violation;
    }

    /// \brief Update the weight vectors (primal variables) after a step on the dual variables.
    virtual void updateWeightVectors(RealMatrix& w, RealVector const& mu, std::size_t index)
    {
        double mean_mu = 0.0;
        for (std::size_t c=0; c<m_classes; c++) mean_mu += mu(c);
        mean_mu /= (double)m_classes;
        RealVector step(m_classes);
        for (std::size_t c=0; c<m_classes; c++) step(c) = mean_mu - mu(c);
        add_scaled(w, step, m_data[index].input);
    }

    /// \brief Solve the sub-problem posed by a single training example.
    virtual double solveSub(double epsilon, RealVector gradient, double q, double C, unsigned int y, blas::matrix_row<RealMatrix>& alpha, RealVector& mu)
    {
        const double ood = 1.0 / m_classes;
        const double qq = (1.0 - ood) * q;
        double gain = 0.0;

        // SMO loop
        std::size_t iter, maxiter = 10 * m_classes;
        for (iter=0; iter<maxiter; iter++)
        {
            // select working set
            std::size_t idx = 0;
            double kkt = 0.0;
            for (std::size_t c=0; c<m_classes; c++)
            {
                if (c == y) continue;

                const double g = gradient(c);
                const double a = alpha(c);
                if (g > kkt && a < C) { kkt = g; idx = c; }
                else if (-g > kkt && a > 0.0) { kkt = -g; idx = c; }
            }

            // check stopping criterion
            if (kkt < epsilon) break;

            // perform step
            const double a = alpha(idx);
            const double g = gradient(idx);
            double m = g / qq;
            double a_new = a + m;
            if (a_new <= 0.0)
            {
                m = -a;
                a_new = 0.0;
            }
            else if (a_new >= C)
            {
                m = C - a;
                a_new = C;
            }
            alpha(idx) = a_new;
            mu(idx) += m;

            // update gradient and total gain
            const double dg = m * q;
            const double dgc = dg / m_classes;
            for (std::size_t c=0; c<m_classes; c++) gradient(c) += dgc;
            gradient(idx) -= dg;

            gain += m * (g - 0.5 * (dg - dgc));
        }

        return gain;
    }

protected:
    using QpMcLinear<InputT>::add_scaled;
    using QpMcLinear<InputT>::m_data;
    using QpMcLinear<InputT>::m_classes;
};
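
// Illustrative worked equation (not part of the original header): the
// mean_mu correction in updateWeightVectors() above keeps the LLW class
// constraint sum_c w_c = 0 intact. The per-class step is
// step(c) = mean_mu - mu(c), and summing over all classes gives
//     sum_c step(c) = m_classes * mean_mu - sum_c mu(c) = 0,
// since mean_mu = (sum_c mu(c)) / m_classes. The raw dual step is thus
// projected onto the sum-to-zero subspace before it is applied to w.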


/// \brief Solver for the multi-class SVM with absolute margin and total sum loss.
template <class InputT>
class QpMcLinearATS : public QpMcLinear<InputT>
{
public:
    typedef typename QpMcLinear<InputT>::DatasetType DatasetType;

    /// \brief Constructor
    QpMcLinearATS(
            const DatasetType& dataset,
            std::size_t dim,
            std::size_t classes)
    : QpMcLinear<InputT>(dataset, dim, classes)
    { }

protected:
    /// \brief Compute the gradient from the inner products of the weight vectors with the current sample.
    virtual double calcGradient(RealVector& gradient, RealVector wx, blas::matrix_row<RealMatrix> const& alpha, double C, unsigned int y)
    {
        double violation = 0.0;
        for (std::size_t c=0; c<m_classes; c++)
        {
            const double g = (c == y) ? 1.0 - wx(y) : 1.0 + wx(c);
            gradient(c) = g;
            if (g > violation && alpha(c) < C) violation = g;
            else if (-g > violation && alpha(c) > 0.0) violation = -g;
        }
        return violation;
    }

    /// \brief Update the weight vectors (primal variables) after a step on the dual variables.
    virtual void updateWeightVectors(RealMatrix& w, RealVector const& mu, std::size_t index)
    {
        unsigned int y = m_data[index].label;
        double mean = -2.0 * mu(y);
        for (std::size_t c=0; c<m_classes; c++) mean += mu(c);
        mean /= (double)m_classes;
        RealVector step(m_classes);
        for (std::size_t c=0; c<m_classes; c++) step(c) = ((c == y) ? (mu(c) + mean) : (mean - mu(c)));
        add_scaled(w, step, m_data[index].input);
    }

    /// \brief Solve the sub-problem posed by a single training example.
    virtual double solveSub(double epsilon, RealVector gradient, double q, double C, unsigned int y, blas::matrix_row<RealMatrix>& alpha, RealVector& mu)
    {
        const double ood = 1.0 / m_classes;
        const double qq = (1.0 - ood) * q;
        double gain = 0.0;

        // SMO loop
        std::size_t iter, maxiter = 10 * m_classes;
        for (iter=0; iter<maxiter; iter++)
        {
            // select working set
            std::size_t idx = 0;
            double kkt = 0.0;
            for (std::size_t c=0; c<m_classes; c++)
            {
                const double g = gradient(c);
                const double a = alpha(c);
                if (g > kkt && a < C) { kkt = g; idx = c; }
                else if (-g > kkt && a > 0.0) { kkt = -g; idx = c; }
            }

            // check stopping criterion
            if (kkt < epsilon) break;

            // perform step
            const double a = alpha(idx);
            const double g = gradient(idx);
            double m = g / qq;
            double a_new = a + m;
            if (a_new <= 0.0)
            {
                m = -a;
                a_new = 0.0;
            }
            else if (a_new >= C)
            {
                m = C - a;
                a_new = C;
            }
            alpha(idx) = a_new;
            mu(idx) += m;

            // update gradient and total gain
            const double dg = m * q;
            const double dgc = dg / m_classes;
            if (idx == y)
            {
                for (std::size_t c=0; c<m_classes; c++) gradient(c) -= dgc;
                gradient(idx) -= dg - 2.0 * dgc;
            }
            else
            {
                for (std::size_t c=0; c<m_classes; c++) gradient(c) += (c == y) ? -dgc : dgc;
                gradient(idx) -= dg;
            }

            gain += m * (g - 0.5 * (dg - dgc));
        }

        return gain;
    }

protected:
    using QpMcLinear<InputT>::add_scaled;
    using QpMcLinear<InputT>::m_data;
    using QpMcLinear<InputT>::m_classes;
};
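
// Illustrative worked equation (not part of the original header): the
// effective curvature qq = (1 - 1/m_classes) * q used by solveSub() above.
// A unit dual step on one class changes the corresponding weight vector by
// x minus the mean correction x / m_classes (see updateWeightVectors()), so
// the gradient of the chosen coordinate changes at rate
//     <x - x / m_classes, x> = (1 - 1/m_classes) * <x, x> = (1 - 1/K) * q
// per unit step, which is exactly the Newton denominator qq computed above.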


/// \brief Solver for the multi-class maximum margin regression SVM.
template <class InputT>
class QpMcLinearMMR : public QpMcLinear<InputT>
{
public:
    typedef typename QpMcLinear<InputT>::DatasetType DatasetType;

    /// \brief Constructor
    QpMcLinearMMR(
            const DatasetType& dataset,
            std::size_t dim,
            std::size_t classes)
    : QpMcLinear<InputT>(dataset, dim, classes)
    { }

protected:
    /// \brief Compute the gradient from the inner products of the weight vectors with the current sample.
    virtual double calcGradient(RealVector& gradient, RealVector wx, blas::matrix_row<RealMatrix> const& alpha, double C, unsigned int y)
    {
        for (std::size_t c=0; c<m_classes; c++) gradient(c) = 0.0;
        const double g = 1.0 - wx(y);
        gradient(y) = g;
        const double a = alpha(0);
        if (g > 0.0)
        {
            if (a == C) return 0.0;
            else return g;
        }
        else
        {
            if (a == 0.0) return 0.0;
            else return -g;
        }
    }

    /// \brief Update the weight vectors (primal variables) after a step on the dual variables.
    virtual void updateWeightVectors(RealMatrix& w, RealVector const& mu, std::size_t index)
    {
        unsigned int y = m_data[index].label;
        double s = mu(0);
        double sc = -s / m_classes;
        double sy = s + sc;
        RealVector step(m_classes);
        for (std::size_t c=0; c<m_classes; c++) step(c) = (c == y) ? sy : sc;
        add_scaled(w, step, m_data[index].input);
    }

    /// \brief Solve the sub-problem posed by a single training example.
    virtual double solveSub(double epsilon, RealVector gradient, double q, double C, unsigned int y, blas::matrix_row<RealMatrix>& alpha, RealVector& mu)
    {
        const double ood = 1.0 / m_classes;
        const double qq = (1.0 - ood) * q;

        double kkt = 0.0;
        const double g = gradient(y);
        const double a = alpha(0);
        if (g > kkt && a < C) kkt = g;
        else if (-g > kkt && a > 0.0) kkt = -g;

        // check stopping criterion
        if (kkt < epsilon) return 0.0;

        // perform step
        double m = g / qq;
        double a_new = a + m;
        if (a_new <= 0.0)
        {
            m = -a;
            a_new = 0.0;
        }
        else if (a_new >= C)
        {
            m = C - a;
            a_new = C;
        }
        alpha(0) = a_new;
        mu(0) = m;

        // return the gain
        return m * (g - 0.5 * m * qq);
    }

protected:
    using QpMcLinear<InputT>::add_scaled;
    using QpMcLinear<InputT>::m_data;
    using QpMcLinear<InputT>::m_classes;
};
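
// Illustrative sketch (not part of the original header): MMR has a single
// dual variable per training example, stored in alpha(0), so solveSub()
// above reduces to one clipped Newton step instead of an SMO loop. For
// example, with gradient g = 1 - <w_y, x> = 0.7, q = <x, x> = 2.0, and 10
// classes, the curvature is qq = (1 - 1/10) * 2.0 = 1.8 and the
// unconstrained step is m = 0.7 / 1.8 ~= 0.389, clipped to the box [0, C]
// exactly as in the multi-variable solvers.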


/// \brief Solver for the multi-class SVM by Crammer & Singer.
template <class InputT>
class QpMcLinearCS : public QpMcLinear<InputT>
{
public:
    typedef typename QpMcLinear<InputT>::DatasetType DatasetType;

    /// \brief Constructor
    QpMcLinearCS(
            const DatasetType& dataset,
            std::size_t dim,
            std::size_t classes)
    : QpMcLinear<InputT>(dataset, dim, classes)
    { }

protected:
    /// \brief Compute the gradient from the inner products of the weight vectors with the current sample.
    virtual double calcGradient(RealVector& gradient, RealVector wx, blas::matrix_row<RealMatrix> const& alpha, double C, unsigned int y)
    {
        if (alpha(m_classes) < C)
        {
            double violation = 0.0;
            for (std::size_t c=0; c<wx.size(); c++)
            {
                if (c == y)
                {
                    gradient(c) = 0.0;
                }
                else
                {
                    const double g = 1.0 - 0.5 * (wx(y) - wx(c));
                    gradient(c) = g;
                    if (g > violation) violation = g;
                    else if (-g > violation && alpha(c) > 0.0) violation = -g;
                }
            }
            return violation;
        }
        else
        {
            // kkt_up starts at zero so that a pure "down" step, which stays
            // feasible when the sum constraint is active, is accounted for
            double kkt_up = 0.0, kkt_down = 1e100;
            for (std::size_t c=0; c<m_classes; c++)
            {
                if (c == y)
                {
                    gradient(c) = 0.0;
                }
                else
                {
                    const double g = 1.0 - 0.5 * (wx(y) - wx(c));
                    gradient(c) = g;
                    if (g > kkt_up && alpha(c) < C) kkt_up = g;
                    if (g < kkt_down && alpha(c) > 0.0) kkt_down = g;
                }
            }
            return std::max(0.0, kkt_up - kkt_down);
        }
    }

    /// \brief Update the weight vectors (primal variables) after a step on the dual variables.
    virtual void updateWeightVectors(RealMatrix& w, RealVector const& mu, std::size_t index)
    {
        unsigned int y = m_data[index].label;
        double sum_mu = 0.0;
        for (std::size_t c=0; c<m_classes; c++) if (c != y) sum_mu += mu(c);
        RealVector step(-0.5 * mu);
        step(y) = 0.5 * sum_mu;
        add_scaled(w, step, m_data[index].input);
    }

    /// \brief Solve the sub-problem posed by a single training example.
    virtual double solveSub(double epsilon, RealVector gradient, double q, double C, unsigned int y, blas::matrix_row<RealMatrix>& alpha, RealVector& mu)
    {
        const double qq = 0.5 * q;
        double gain = 0.0;

        // SMO loop
        std::size_t iter, maxiter = 10 * m_classes;
        for (iter=0; iter<maxiter; iter++)
        {
            // select working set
            std::size_t idx = 0;
            std::size_t idx_up = 0, idx_down = 0;
            bool size2 = false;
            double kkt = 0.0;
            double grad = 0.0;
            if (alpha(m_classes) == C)
            {
                double kkt_up = -1e100, kkt_down = 1e100;
                for (std::size_t c=0; c<m_classes; c++)
                {
                    if (c == y) continue;

                    const double g = gradient(c);
                    const double a = alpha(c);
                    if (g > kkt_up && a < C) { kkt_up = g; idx_up = c; }
                    if (g < kkt_down && a > 0.0) { kkt_down = g; idx_down = c; }
                }

                if (kkt_up <= 0.0)
                {
                    idx = idx_down;
                    grad = kkt_down;
                    kkt = -kkt_down;
                }
                else
                {
                    grad = kkt_up - kkt_down;
                    kkt = grad;
                    size2 = true;
                }
            }
            else
            {
                for (std::size_t c=0; c<m_classes; c++)
                {
                    if (c == y) continue;

                    const double g = gradient(c);
                    const double a = alpha(c);
                    if (g > kkt) { kkt = g; idx = c; }
                    else if (-g > kkt && a > 0.0) { kkt = -g; idx = c; }
                }
                grad = gradient(idx);
            }

            // check stopping criterion
            if (kkt < epsilon) return gain;

            if (size2)
            {
                // perform step
                const double a_up = alpha(idx_up);
                const double a_down = alpha(idx_down);
                double m = grad / qq;
                double a_up_new = a_up + m;
                double a_down_new = a_down - m;
                if (a_down_new <= 0.0)
                {
                    m = a_down;
                    a_up_new = a_up + m;
                    a_down_new = 0.0;
                }
                alpha(idx_up) = a_up_new;
                alpha(idx_down) = a_down_new;
                mu(idx_up) += m;
                mu(idx_down) -= m;

                // update gradient and total gain
                const double dg = 0.5 * m * qq;
                gradient(idx_up) -= dg;
                gradient(idx_down) += dg;
                gain += m * (grad - 2.0 * dg);
            }
            else
            {
                // perform step
                const double a = alpha(idx);
                const double a_sum = alpha(m_classes);
                double m = grad / qq;
                double a_new = a + m;
                double a_sum_new = a_sum + m;
                if (a_new <= 0.0)
                {
                    m = -a;
                    a_new = 0.0;
                    a_sum_new = a_sum + m;
                }
                else if (a_sum_new >= C)
                {
                    m = C - a_sum;
                    a_sum_new = C;
                    a_new = a + m;
                }
                alpha(idx) = a_new;
                alpha(m_classes) = a_sum_new;
                mu(idx) += m;

                // update gradient and total gain
                const double dg = 0.5 * m * qq;
                for (std::size_t c=0; c<m_classes; c++) gradient(c) -= dg;
                gradient(idx) -= dg;
                gain += m * (grad - dg);
            }
        }

        return gain;
    }

protected:
    using QpMcLinear<InputT>::add_scaled;
    using QpMcLinear<InputT>::m_data;
    using QpMcLinear<InputT>::m_classes;
};
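
// Illustrative sketch (not part of the original header): why solveSub()
// above switches to pair steps. The Crammer-Singer dual constrains the sum
// of each example's variables, tracked in the extra column alpha(m_classes).
// While the sum is below C, single coordinates can grow freely. Once
// alpha(m_classes) == C, increasing one variable requires decreasing
// another, so the solver picks the steepest feasible pair (idx_up, idx_down)
// and moves them by +m and -m, preserving the sum and hence feasibility:
//     alpha(idx_up) += m;  alpha(idx_down) -= m;  // sum unchanged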


/// \brief Solver for the multi-class SVM with absolute margin and discriminative maximum loss.
template <class InputT>
class QpMcLinearADM : public QpMcLinear<InputT>
{
public:
    typedef typename QpMcLinear<InputT>::DatasetType DatasetType;

    /// \brief Constructor
    QpMcLinearADM(
            const DatasetType& dataset,
            std::size_t dim,
            std::size_t classes)
    : QpMcLinear<InputT>(dataset, dim, classes)
    { }

protected:
    /// \brief Compute the gradient from the inner products of the weight vectors with the current sample.
    virtual double calcGradient(RealVector& gradient, RealVector wx, blas::matrix_row<RealMatrix> const& alpha, double C, unsigned int y)
    {
        if (alpha(m_classes) < C)
        {
            double violation = 0.0;
            for (std::size_t c=0; c<m_classes; c++)
            {
                if (c == y)
                {
                    gradient(c) = 0.0;
                }
                else
                {
                    const double g = 1.0 + wx(c);
                    gradient(c) = g;
                    if (g > violation) violation = g;
                    else if (-g > violation && alpha(c) > 0.0) violation = -g;
                }
            }
            return violation;
        }
        else
        {
            double kkt_up = 0.0, kkt_down = 1e100;
            for (std::size_t c=0; c<m_classes; c++)
            {
                if (c == y)
                {
                    gradient(c) = 0.0;
                }
                else
                {
                    const double g = 1.0 + wx(c);
                    gradient(c) = g;
                    if (g > kkt_up && alpha(c) < C) kkt_up = g;
                    if (g < kkt_down && alpha(c) > 0.0) kkt_down = g;
                }
            }
            return std::max(0.0, kkt_up - kkt_down);
        }
    }

    /// \brief Update the weight vectors (primal variables) after a step on the dual variables.
    virtual void updateWeightVectors(RealMatrix& w, RealVector const& mu, std::size_t index)
    {
        double mean_mu = 0.0;
        for (std::size_t c=0; c<m_classes; c++) mean_mu += mu(c);
        mean_mu /= (double)m_classes;
        RealVector step(m_classes);
        for (std::size_t c=0; c<m_classes; c++) step(c) = mean_mu - mu(c);
        add_scaled(w, step, m_data[index].input);
    }

    /// \brief Solve the sub-problem posed by a single training example.
    virtual double solveSub(double epsilon, RealVector gradient, double q, double C, unsigned int y, blas::matrix_row<RealMatrix>& alpha, RealVector& mu)
    {
        const double ood = 1.0 / m_classes;
        const double qq = (1.0 - ood) * q;
        double gain = 0.0;

        // SMO loop
        std::size_t iter, maxiter = 10 * m_classes;
        for (iter=0; iter<maxiter; iter++)
        {
            // select working set
            std::size_t idx = 0;
            std::size_t idx_up = 0, idx_down = 0;
            bool size2 = false;
            double kkt = 0.0;
            double grad = 0.0;
            if (alpha(m_classes) == C)
            {
                double kkt_up = -1e100, kkt_down = 1e100;
                for (std::size_t c=0; c<m_classes; c++)
                {
                    if (c == y) continue;

                    const double g = gradient(c);
                    const double a = alpha(c);
                    if (g > kkt_up && a < C) { kkt_up = g; idx_up = c; }
                    if (g < kkt_down && a > 0.0) { kkt_down = g; idx_down = c; }
                }

                if (kkt_up <= 0.0)
                {
                    idx = idx_down;
                    grad = kkt_down;
                    kkt = -kkt_down;
                }
                else
                {
                    grad = kkt_up - kkt_down;
                    kkt = grad;
                    size2 = true;
                }
            }
            else
            {
                for (std::size_t c=0; c<m_classes; c++)
                {
                    if (c == y) continue;

                    const double g = gradient(c);
                    const double a = alpha(c);
                    if (g > kkt) { kkt = g; idx = c; }
                    else if (-g > kkt && a > 0.0) { kkt = -g; idx = c; }
                }
                grad = gradient(idx);
            }

            // check stopping criterion
            if (kkt < epsilon) return gain;

            if (size2)
            {
                // perform step
                const double a_up = alpha(idx_up);
                const double a_down = alpha(idx_down);
                double m = grad / (2.0 * q);
                double a_up_new = a_up + m;
                double a_down_new = a_down - m;
                if (a_down_new <= 0.0)
                {
                    m = a_down;
                    a_up_new = a_up + m;
                    a_down_new = 0.0;
                }
                alpha(idx_up) = a_up_new;
                alpha(idx_down) = a_down_new;
                mu(idx_up) += m;
                mu(idx_down) -= m;

                // update gradient and total gain
                const double dg = m * q;
                const double dgc = dg / m_classes;
                gradient(idx_up) -= dg;
                gradient(idx_down) += dg;
                gain += m * (grad - (dg - dgc));
            }
            else
            {
                // perform step
                const double a = alpha(idx);
                const double a_sum = alpha(m_classes);
                double m = grad / qq;
                double a_new = a + m;
                double a_sum_new = a_sum + m;
                if (a_new <= 0.0)
                {
                    m = -a;
                    a_new = 0.0;
                    a_sum_new = a_sum + m;
                }
                else if (a_sum_new >= C)
                {
                    m = C - a_sum;
                    a_sum_new = C;
                    a_new = a + m;
                }
                alpha(idx) = a_new;
                alpha(m_classes) = a_sum_new;
                mu(idx) += m;

                // update gradient and total gain
                const double dg = m * q;
                const double dgc = dg / m_classes;
                for (std::size_t c=0; c<m_classes; c++) gradient(c) += dgc;
                gradient(idx) -= dg;
                gain += m * (grad - 0.5 * (dg - dgc));
            }
        }

        return gain;
    }

protected:
    using QpMcLinear<InputT>::add_scaled;
    using QpMcLinear<InputT>::m_data;
    using QpMcLinear<InputT>::m_classes;
};
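
// Illustrative worked equation (not part of the original header): ADM uses
// the same sum-to-C constraint handling as the Crammer-Singer solver above,
// but with absolute margins: gradient(c) = 1 + <w_c, x> for c != y instead
// of the relative margin 1 - 0.5 * (<w_y, x> - <w_c, x>). For example, with
// <w_c, x> = -0.4 the ADM gradient is g = 1 - 0.4 = 0.6 > 0, so class c
// still violates its target margin <w_c, x> <= -1 and its dual variable is
// a candidate for an "up" step.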


/// \brief Solver for the multi-class SVM with absolute margin and total maximum loss.
template <class InputT>
class QpMcLinearATM : public QpMcLinear<InputT>
{
public:
    typedef typename QpMcLinear<InputT>::DatasetType DatasetType;

    /// \brief Constructor
    QpMcLinearATM(
            const DatasetType& dataset,
            std::size_t dim,
            std::size_t classes)
    : QpMcLinear<InputT>(dataset, dim, classes)
    { }

protected:
    /// \brief Compute the gradient from the inner products of the weight vectors with the current sample.
    virtual double calcGradient(RealVector& gradient, RealVector wx, blas::matrix_row<RealMatrix> const& alpha, double C, unsigned int y)
    {
        if (alpha(m_classes) < C)
        {
            double violation = 0.0;
            for (std::size_t c=0; c<m_classes; c++)
            {
                const double g = (c == y) ? 1.0 - wx(y) : 1.0 + wx(c);
                gradient(c) = g;
                if (g > violation) violation = g;
                else if (-g > violation && alpha(c) > 0.0) violation = -g;
            }
            return violation;
        }
        else
        {
            double kkt_up = 0.0, kkt_down = 1e100;
            for (std::size_t c=0; c<m_classes; c++)
            {
                const double g = (c == y) ? 1.0 - wx(y) : 1.0 + wx(c);
                gradient(c) = g;
                if (g > kkt_up && alpha(c) < C) kkt_up = g;
                if (g < kkt_down && alpha(c) > 0.0) kkt_down = g;
            }
            return std::max(0.0, kkt_up - kkt_down);
        }
    }

    /// \brief Update the weight vectors (primal variables) after a step on the dual variables.
    virtual void updateWeightVectors(RealMatrix& w, RealVector const& mu, std::size_t index)
    {
        unsigned int y = m_data[index].label;
        double mean = -2.0 * mu(y);
        for (std::size_t c=0; c<m_classes; c++) mean += mu(c);
        mean /= (double)m_classes;
        RealVector step(m_classes);
        for (std::size_t c=0; c<m_classes; c++) step(c) = (c == y) ? (mu(c) + mean) : (mean - mu(c));
        add_scaled(w, step, m_data[index].input);
    }

    /// \brief Solve the sub-problem posed by a single training example.
    virtual double solveSub(double epsilon, RealVector gradient, double q, double C, unsigned int y, blas::matrix_row<RealMatrix>& alpha, RealVector& mu)
    {
        const double ood = 1.0 / m_classes;
        const double qq = (1.0 - ood) * q;
        double gain = 0.0;

        // SMO loop
        std::size_t iter, maxiter = 10 * m_classes;
        for (iter=0; iter<maxiter; iter++)
        {
            // select working set
            std::size_t idx = 0;
            std::size_t idx_up = 0, idx_down = 0;
            bool size2 = false;
            double kkt = 0.0;
            double grad = 0.0;
            if (alpha(m_classes) == C)
            {
                double kkt_up = -1e100, kkt_down = 1e100;
                for (std::size_t c=0; c<m_classes; c++)
                {
                    const double g = gradient(c);
                    const double a = alpha(c);
                    if (g > kkt_up && a < C) { kkt_up = g; idx_up = c; }
                    if (g < kkt_down && a > 0.0) { kkt_down = g; idx_down = c; }
                }

                if (kkt_up <= 0.0)
                {
                    idx = idx_down;
                    grad = kkt_down;
                    kkt = -kkt_down;
                }
                else
                {
                    grad = kkt_up - kkt_down;
                    kkt = grad;
                    size2 = true;
                }
            }
            else
            {
                for (std::size_t c=0; c<m_classes; c++)
                {
                    const double g = gradient(c);
                    const double a = alpha(c);
                    if (g > kkt) { kkt = g; idx = c; }
                    else if (-g > kkt && a > 0.0) { kkt = -g; idx = c; }
                }
                grad = gradient(idx);
            }

            // check stopping criterion
            if (kkt < epsilon) return gain;

            if (size2)
            {
                // perform step
                const double a_up = alpha(idx_up);
                const double a_down = alpha(idx_down);
                double m = grad / (2.0 * q);
                double a_up_new = a_up + m;
                double a_down_new = a_down - m;
                if (a_down_new <= 0.0)
                {
                    m = a_down;
                    a_up_new = a_up + m;
                    a_down_new = 0.0;
                }
                alpha(idx_up) = a_up_new;
                alpha(idx_down) = a_down_new;
                mu(idx_up) += m;
                mu(idx_down) -= m;

                // update gradient and total gain
                const double dg = m * q;
                const double dgc = dg / m_classes;
                if (idx_up == y)
                {
                    for (std::size_t c=0; c<m_classes; c++) gradient(c) -= dgc;
                    gradient(idx_up) -= dg - 2.0 * dgc;
                    gradient(idx_down) += dg;
                }
                else if (idx_down == y)
                {
                    gradient(idx_up) -= dg;
                    gradient(idx_down) += dg - 2.0 * dgc;
                }
                else
                {
                    gradient(idx_up) -= dg;
                    gradient(idx_down) += dg;
                }
                gain += m * (grad - (dg - dgc));
            }
            else
            {
                // perform step
                const double a = alpha(idx);
                const double a_sum = alpha(m_classes);
                double m = grad / qq;
                double a_new = a + m;
                double a_sum_new = a_sum + m;
                if (a_new <= 0.0)
                {
                    m = -a;
                    a_new = 0.0;
                    a_sum_new = a_sum + m;
                }
                else if (a_sum_new >= C)
                {
                    m = C - a_sum;
                    a_sum_new = C;
                    a_new = a + m;
                }
                alpha(idx) = a_new;
                alpha(m_classes) = a_sum_new;
                mu(idx) += m;

                // update gradient and total gain
                const double dg = m * q;
                const double dgc = dg / m_classes;
                if (idx == y)
                {
                    for (std::size_t c=0; c<m_classes; c++) gradient(c) -= dgc;
                    gradient(idx) -= dg - 2.0 * dgc;
                }
                else
                {
                    for (std::size_t c=0; c<m_classes; c++) gradient(c) += (c == y) ? -dgc : dgc;
                    gradient(idx) -= dg;
                }
                gain += m * (grad - 0.5 * (dg - dgc));
            }
        }

        return gain;
    }

protected:
    using QpMcLinear<InputT>::add_scaled;
    using QpMcLinear<InputT>::m_data;
    using QpMcLinear<InputT>::m_classes;
};
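
// Illustrative sketch (not part of the original header): the case analysis
// in solveSub() above exists because the true class enters the ATM margin
// with opposite sign, gradient(y) = 1 - <w_y, x> versus
// gradient(c) = 1 + <w_c, x> for c != y. A step involving y therefore
// propagates through the mean correction with a flipped sign, which is why
// the branches idx_up == y, idx_down == y, and idx == y apply different
// combinations of dg = m * q and dgc = dg / m_classes.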


/// \brief Solver for the "reinforced" multi-class SVM.
template <class InputT>
class QpMcLinearReinforced : public QpMcLinear<InputT>
{
public:
    typedef typename QpMcLinear<InputT>::DatasetType DatasetType;

    /// \brief Constructor
    QpMcLinearReinforced(
            const DatasetType& dataset,
            std::size_t dim,
            std::size_t classes)
    : QpMcLinear<InputT>(dataset, dim, classes)
    { }

protected:
    /// \brief Compute the gradient from the inner products of the weight vectors with the current sample.
    virtual double calcGradient(RealVector& gradient, RealVector wx, blas::matrix_row<RealMatrix> const& alpha, double C, unsigned int y)
    {
        double violation = 0.0;
        for (std::size_t c=0; c<m_classes; c++)
        {
            const double g = (c == y) ? (m_classes - 1.0) - wx(y) : 1.0 + wx(c);
            gradient(c) = g;
            if (g > violation && alpha(c) < C) violation = g;
            else if (-g > violation && alpha(c) > 0.0) violation = -g;
        }
        return violation;
    }

    /// \brief Update the weight vectors (primal variables) after a step on the dual variables.
    virtual void updateWeightVectors(RealMatrix& w, RealVector const& mu, std::size_t index)
    {
        unsigned int y = m_data[index].label;
        double mean = -2.0 * mu(y);
        for (std::size_t c=0; c<m_classes; c++) mean += mu(c);
        mean /= (double)m_classes;
        RealVector step(m_classes);
        for (std::size_t c=0; c<m_classes; c++) step(c) = ((c == y) ? (mu(c) + mean) : (mean - mu(c)));
        add_scaled(w, step, m_data[index].input);
    }

    /// \brief Solve the sub-problem posed by a single training example.
    virtual double solveSub(double epsilon, RealVector gradient, double q, double C, unsigned int y, blas::matrix_row<RealMatrix>& alpha, RealVector& mu)
    {
        const double ood = 1.0 / m_classes;
        const double qq = (1.0 - ood) * q;
        double gain = 0.0;

        // SMO loop
        std::size_t iter, maxiter = 10 * m_classes;
        for (iter=0; iter<maxiter; iter++)
        {
            // select working set
            std::size_t idx = 0;
            double kkt = 0.0;
            for (std::size_t c=0; c<m_classes; c++)
            {
                const double g = gradient(c);
                const double a = alpha(c);
                if (g > kkt && a < C) { kkt = g; idx = c; }
                else if (-g > kkt && a > 0.0) { kkt = -g; idx = c; }
            }

            // check stopping criterion
            if (kkt < epsilon) break;

            // perform step
            const double a = alpha(idx);
            const double g = gradient(idx);
            double m = g / qq;
            double a_new = a + m;
            if (a_new <= 0.0)
            {
                m = -a;
                a_new = 0.0;
            }
            else if (a_new >= C)
            {
                m = C - a;
                a_new = C;
            }
            alpha(idx) = a_new;
            mu(idx) += m;

            // update gradient and total gain
            const double dg = m * q;
            const double dgc = dg / m_classes;
            if (idx == y)
            {
                for (std::size_t c=0; c<m_classes; c++) gradient(c) -= dgc;
                gradient(idx) -= dg - 2.0 * dgc;
            }
            else
            {
                for (std::size_t c=0; c<m_classes; c++) gradient(c) += (c == y) ? -dgc : dgc;
                gradient(idx) -= dg;
            }

            gain += m * (g - 0.5 * (dg - dgc));
        }

        return gain;
    }

protected:
    using QpMcLinear<InputT>::add_scaled;
    using QpMcLinear<InputT>::m_data;
    using QpMcLinear<InputT>::m_classes;
};
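
// Illustrative worked equation (not part of the original header): the
// "reinforced" loss rewards the true class with target margin m_classes - 1
// instead of 1, hence gradient(y) = (m_classes - 1) - <w_y, x> in
// calcGradient() above. With 4 classes and <w_y, x> = 2.5 this gives
// g = 3 - 2.5 = 0.5, so the true class's variable is still pushed upward,
// whereas the plain ATS gradient 1 - 2.5 = -1.5 would push it down.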


}
#endif