KernelSGDTrainer.h
//===========================================================================
/*!
 *
 *
 * \brief Generic stochastic gradient descent training for kernel-based models.
 *
 *
 *
 *
 * \author T. Glasmachers
 * \date 2013
 *
 *
 * \par Copyright 1995-2017 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <http://shark-ml.org/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================


#ifndef SHARK_ALGORITHMS_KERNELSGDTRAINER_H
#define SHARK_ALGORITHMS_KERNELSGDTRAINER_H




namespace shark
{

///
/// \brief Generic stochastic gradient descent training for kernel-based models.
///
/// Given a differentiable loss function L(y, f) for classification
/// this trainer solves the regularized risk minimization problem
/// \f[
///     \min \frac{1}{2} \sum_j \|w_j\|^2 + C \sum_i L(y_i, f(x_i)),
/// \f]
/// where i runs over training data, j over classes, and C > 0 is the
/// regularization parameter.
///
/// \par
/// This implementation is an adaptation of the PEGASOS algorithm, see the paper
/// <i>Shalev-Shwartz et al. "Pegasos: Primal estimated sub-gradient solver for SVM." Mathematical Programming 127.1 (2011): 3-30.</i><br/><br/>
/// However, the (non-essential) projection step is dropped, and the
/// algorithm is applied to a kernelized model. The resulting
/// optimization scheme amounts to plain standard stochastic gradient
/// descent (SGD) with update steps of the form
/// \f[
///     w_j \leftarrow (1 - 1/t) w_j - \frac{C}{t} \frac{\partial L(y_i, f(x_i))}{\partial w_j}
/// \f]
/// for random index i. The only notable trick borrowed from that paper
/// is the representation of the weight vectors in the form
/// \f[
///     w_j = s \cdot \sum_i \alpha_{i,j} k(x_i, \cdot)
/// \f]
/// with a scalar factor s (called alphaScale in the code). This enables
/// scaling with factor (1 - 1/t) in constant time.
///
/// \par
/// NOTE: Being an SGD-based solver, this algorithm is relatively fast for
/// differentiable loss functions such as the logistic loss (class CrossEntropy).
/// It suffers from significantly slower convergence for non-differentiable
/// losses, e.g., the hinge loss for SVM training.
///
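/// \par
/// A minimal usage sketch; the data set, kernel bandwidth, and regularization
/// constant below are placeholders, and any kernel and (sub-)differentiable
/// loss can be substituted:
/// \code
/// ClassificationDataset data = ...;                 // labeled training data
/// GaussianRbfKernel<RealVector> kernel(0.5);        // kernel with bandwidth parameter 0.5
/// CrossEntropy loss;                                // differentiable classification loss
/// KernelSGDTrainer<RealVector> trainer(&kernel, &loss, 1.0, true);  // C = 1, train with offset
/// KernelClassifier<RealVector> classifier;
/// trainer.train(classifier, data);
/// \endcode
///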
template <class InputType, class CacheType = float>
class KernelSGDTrainer : public AbstractTrainer< KernelClassifier<InputType> >, public IParameterizable<>
{
public:
    typedef KernelClassifier<InputType> ClassifierType;
    typedef KernelExpansion<InputType> ModelType;
    typedef AbstractKernelFunction<InputType> KernelType;
    typedef AbstractLoss<unsigned int, RealVector> LossType;
    typedef CacheType QpFloatType;

    typedef KernelMatrix<InputType, QpFloatType> KernelMatrixType;
    typedef PartlyPrecomputedMatrix<KernelMatrixType> PartlyPrecomputedMatrixType;

    /// \brief Constructor
    ///
    /// \param kernel         kernel function to use for training and prediction
    /// \param loss           (sub-)differentiable loss function
    /// \param C              regularization parameter - always the 'true' value of C, even when unconstrained is set
    /// \param offset         whether to train with an offset/bias parameter or not
    /// \param unconstrained  whether C is passed as log(C) via setParameterVector and has to be sent through the exp-function before the solver uses it
    /// \param cacheSize      size of the kernel cache
    KernelSGDTrainer(KernelType* kernel, const LossType* loss, double C, bool offset, bool unconstrained = false, std::size_t cacheSize = 0x4000000)
    : m_kernel(kernel)
    , m_loss(loss)
    , m_C(C)
    , m_offset(offset)
    , m_unconstrained(unconstrained)
    , m_epochs(0)
    , m_cacheSize(cacheSize)
    { }


    /// return the current cache size
    std::size_t cacheSize() const
    {
        return m_cacheSize;
    }

    /// set the size of the cache
    void setCacheSize(std::size_t size)
    {
        m_cacheSize = size;
    }

    /// \brief From INameable: return the class name.
    std::string name() const
    { return "KernelSGDTrainer"; }

    void train(ClassifierType& classifier, const LabeledData<InputType, unsigned int>& dataset)
    {
        std::size_t ell = dataset.numberOfElements();
        std::size_t classes = numberOfClasses(dataset);
        ModelType& model = classifier.decisionFunction();

        model.setStructure(m_kernel, dataset.inputs(), m_offset, classes);

        RealMatrix& alpha = model.alpha();

        // pre-compute the kernel matrix (may change in the future)
        // and create linear array of labels
        KernelMatrixType km(*(this->m_kernel), dataset.inputs());
        PartlyPrecomputedMatrixType K(&km, m_cacheSize);
        UIntVector y = createBatch(dataset.labels().elements());
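        // regularization strength in the Pegasos parameterization: minimizing
        // lambda * ||w||^2 + (1/ell) * sum_i L(y_i, f(x_i)) with lambda = 1/(2*C*ell)
        // is equivalent (up to a constant factor) to the C-regularized objective above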
        const double lambda = 0.5 / (ell * m_C);

        double alphaScale = 1.0;
        std::size_t iterations;
        if(m_epochs == 0) iterations = std::max(10 * ell, std::size_t(std::ceil(m_C * ell)));
        else iterations = m_epochs * ell;

        // preinitialize everything to prevent costly memory allocations in the loop
        RealVector f_b(classes, 0.0);
        RealVector derivative(classes, 0.0);

        // SGD loop
        blas::vector<QpFloatType> kernelRow(ell, 0);
        for(std::size_t iter = 0; iter < iterations; iter++)
        {
            // active variable
            std::size_t b = random::discrete(random::globalRng, std::size_t(0), ell - 1);

            // learning rate
            const double eta = 1.0 / (lambda * (iter + ell));

            // compute prediction
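            // f(x_b) = alphaScale * alpha^T k_b (+ offset), with k_b the cached kernel row of x_b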
            K.row(b, kernelRow);
            noalias(f_b) = alphaScale * prod(trans(alpha), kernelRow);
            if(m_offset) noalias(f_b) += model.offset();

            // stochastic gradient descent (SGD) step
            derivative.clear();
            m_loss->evalDerivative(y[b], f_b, derivative);

            // alphaScale *= (1.0 - eta * lambda);
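            // closed form of the running product of the (1 - eta * lambda) factors,
            // which telescopes to (ell - 1) / (ell + iter)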
            alphaScale = (ell - 1.0) / (ell + iter); // numerically more stable

            noalias(row(alpha, b)) -= (eta / alphaScale) * derivative;
            if(m_offset) noalias(model.offset()) -= eta * derivative;
        }

        alpha *= alphaScale;

        // model.sparsify();
    }

    /// Return the number of training epochs.
    /// A value of 0 indicates that the default of max(10, C) should be used.
    std::size_t epochs() const
    { return m_epochs; }

    /// Set the number of training epochs.
    /// A value of 0 indicates that the default of max(10, C) should be used.
    void setEpochs(std::size_t value)
    { m_epochs = value; }

    /// get the kernel function
    KernelType* kernel()
    { return m_kernel; }
    /// get the kernel function
    const KernelType* kernel() const
    { return m_kernel; }
    /// set the kernel function
    void setKernel(KernelType* kernel)
    { m_kernel = kernel; }

    /// check whether the parameter C is represented as log(C), thus,
    /// in a form suitable for unconstrained optimization, in the
    /// parameter vector
    bool isUnconstrained() const
    { return m_unconstrained; }

    /// return the value of the regularization parameter
    double C() const
    { return m_C; }

    /// set the value of the regularization parameter (must be positive)
    void setC(double value)
    {
        RANGE_CHECK(value > 0.0);
        m_C = value;
    }

    /// check whether the model to be trained should include an offset term
    bool trainOffset() const
    { return m_offset; }

    ///\brief Returns the vector of hyper-parameters: the kernel parameters, followed by C (or log(C) if unconstrained).
    RealVector parameterVector() const{
        double parC = m_unconstrained ? std::log(m_C) : m_C;
        return m_kernel->parameterVector() | parC;
    }

    ///\brief Sets the vector of hyper-parameters.
    void setParameterVector(RealVector const& newParameters){
        std::size_t kp = m_kernel->numberOfParameters();
        SHARK_ASSERT(newParameters.size() == kp + 1);
        m_kernel->setParameterVector(subrange(newParameters, 0, kp));
        m_C = newParameters.back();
        if(m_unconstrained) m_C = std::exp(m_C);
    }

    ///\brief Returns the number of hyper-parameters.
    std::size_t numberOfParameters() const{
        return m_kernel->numberOfParameters() + 1;
    }

protected:
    KernelType* m_kernel;        ///< pointer to kernel function
    const LossType* m_loss;      ///< pointer to loss function
    double m_C;                  ///< regularization parameter
    bool m_offset;               ///< should the resulting model have an offset term?
    bool m_unconstrained;        ///< should C be stored as log(C) as a parameter?
    std::size_t m_epochs;        ///< number of training epochs (sweeps over the data), or 0 for default = max(10, C)

    std::size_t m_cacheSize;     ///< size of the cache to use
};


}
#endif