LassoRegression.h
//===========================================================================
/*!
 *
 * \brief LASSO Regression
 *
 * \author T. Glasmachers
 * \date 2013
 *
 * \par Copyright 1995-2017 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <http://shark-ml.org/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================


#ifndef SHARK_ALGORITHMS_TRAINERS_LASSOREGRESSION_H
#define SHARK_ALGORITHMS_TRAINERS_LASSOREGRESSION_H

#include <shark/Algorithms/Trainers/AbstractTrainer.h>
#include <shark/Models/LinearModel.h>

#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <iostream>


namespace shark {

/*!
 * \brief LASSO Regression
 *
 * LASSO Regression extracts a sparse vector of regression
 * coefficients. The original method amounts to L1-constrained
 * least squares regression, while this implementation uses an
 * L1 penalty instead of a constraint (which is equivalent).
 *
 * For data vectors \f$ x_i \f$ with real-valued labels \f$ y_i \f$
 * the trainer solves the problem
 * \f$ \min_w \quad \frac{1}{2} \sum_i (w^T x_i - y_i)^2 + \lambda \|w\|_1 \f$.
 * The target accuracy of the solution is measured in terms of the
 * largest absolute component of the gradient of the objective function.
 *
 * The trainer has one template parameter, namely the type of
 * the input vectors \f$ x_i \f$. These need to be vector valued,
 * typically either RealVector or CompressedRealVector. The
 * resulting weight vector w is represented by a LinearModel
 * object. Currently model outputs and labels are restricted to a
 * single dimension. A brief usage sketch is given after the class
 * definition below.
 */
template <class InputVectorType = RealVector>
class LassoRegression : public AbstractTrainer<LinearModel<InputVectorType> >, public IParameterizable<>
{
public:
    typedef LinearModel<InputVectorType> ModelType;
    typedef LabeledData<InputVectorType, RealVector> DataType;

    /// \brief Constructor.
    ///
    /// \param lambda value of the regularization parameter (see class description)
    /// \param accuracy stopping criterion for the iterative solver, maximal gradient component of the objective function (see class description)
    LassoRegression(double lambda, double accuracy = 0.01)
    : m_lambda(lambda)
    , m_accuracy(accuracy)
    {
        RANGE_CHECK(m_lambda >= 0.0);
        RANGE_CHECK(m_accuracy > 0.0);
    }

    /// \brief From INameable: return the class name.
    std::string name() const
    { return "LASSO regression"; }


    /// \brief Return the current setting of the regularization parameter.
    double lambda() const
    {
        return m_lambda;
    }

    /// \brief Set the regularization parameter.
    void setLambda(double lambda)
    {
        RANGE_CHECK(lambda >= 0.0);
        m_lambda = lambda;
    }

    /// \brief Return the current setting of the accuracy (maximal gradient component of the objective function).
    double accuracy() const
    {
        return m_accuracy;
    }

    /// \brief Set the accuracy (maximal gradient component of the objective function).
    void setAccuracy(double accuracy)
    {
        RANGE_CHECK(accuracy > 0.0);
        m_accuracy = accuracy;
    }

    /// \brief Get the regularization parameter lambda through the IParameterizable interface.
    RealVector parameterVector() const
    {
        return RealVector(1, m_lambda);
    }

    /// \brief Set the regularization parameter lambda through the IParameterizable interface.
    void setParameterVector(const RealVector& param)
    {
        SIZE_CHECK(param.size() == 1);
        RANGE_CHECK(param(0) >= 0.0);
        m_lambda = param(0);
    }

    /// \brief Return the number of parameters (one in this case).
    size_t numberOfParameters() const
    {
        return 1;
    }

    /// \brief Train a linear model with LASSO regression.
    void train(ModelType& model, DataType const& dataset){

        // strategy constants
        const double CHANGE_RATE = 0.2;
        const double PREF_MIN = 0.05;
        const double PREF_MAX = 20.0;

        // console output
        const bool verbose = false;

        std::size_t dim = inputDimension(dataset);
        RealVector w(dim, 0.0);

        // transpose the dataset and push it inside a single matrix
        typename Batch<InputVectorType>::type data = trans(createBatch(dataset.inputs().elements()));
        RealVector label = column(createBatch(dataset.labels().elements()), 0);

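        // Working state of the coordinate descent solver: "diag" holds the
        // per-feature curvature (the squared norm of each feature row),
        // "difference" is the current residual w^T x_i - y_i over all examples
        // (equal to -y initially, since w = 0), and "index" is the schedule of
        // coordinates for one sweep.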
        RealVector diag(dim);
        RealVector difference = -label;
        UIntVector index(dim);

        // pre-calculate diagonal matrix entries (feature-wise squared norms)
        for (size_t i=0; i<dim; i++){
            diag[i] = norm_sqr(row(data,i));
        }

        // prepare preferences for scheduling
        RealVector pref(dim, 1.0);
        double prefsum = (double)dim;

        // prepare performance monitoring for self-adaptation
        const double gain_learning_rate = 1.0 / dim;
        double average_gain = 0.0;
        bool canstop = true;
        const double lambda = m_lambda;

        // main optimization loop
        std::size_t iter = 0;
        std::size_t steps = 0;
        while (true)
        {
            double maxvio = 0.0;

            // define schedule
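            // Each coordinate i receives a number of slots in the schedule that
            // is (in expectation) proportional to its preference pref[i]; the
            // fractional part is resolved by a random draw. The schedule is then
            // shuffled so the coordinates are visited in random order.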
            double psum = prefsum;
            prefsum = 0.0;
            int pos = 0;

            for (std::size_t i=0; i<dim; i++)
            {
                double p = pref[i];
                double n;
                if (psum >= 1e-6 && p < psum)
                    n = (dim - pos) * p / psum;
                else
                    n = (double)(dim - pos); // for numerical stability

                unsigned int m = (unsigned int)std::floor(n);
                double prob = n - m;
                if ((double)rand() / (double)RAND_MAX < prob) m++;
                for (std::size_t j=0; j<m; j++)
                {
                    index[pos] = (unsigned int)i;
                    pos++;
                }
                psum -= p;
                prefsum += p;
            }
            for (std::size_t i=0; i<dim; i++)
            {
                std::size_t r = rand() % dim;
                std::swap(index[r], index[i]);
            }

            steps += dim;
            for (size_t s=0; s<dim; s++)
            {
                std::size_t i = index[s];
                double a = w[i];
                double d = diag[i];

                // compute gradient
                double grad = inner_prod(difference, row(data,i));

                // compute optimal coordinate descent step and corresponding gain
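                // The objective restricted to coordinate i is piecewise quadratic
                // in w[i]. The three branches below treat w[i] == 0, w[i] > 0 and
                // w[i] < 0: "vio" is the violation of the optimality condition
                // (used for the stopping test), "delta" is the minimizing step
                // (clipped at zero, and possibly continued past it, when the
                // unconstrained step would flip the sign of w[i]), and "gain"
                // measures the progress made by the step (used to adapt the
                // scheduling preferences).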
                double vio = 0.0;
                double gain = 0.0;
                double delta = 0.0;
                if (a == 0.0)
                {
                    if (grad > lambda)
                    {
                        vio = grad - lambda;
                        delta = -vio / d;
                        gain = 0.5 * d * delta * delta;
                    }
                    else if (grad < -lambda)
                    {
                        vio = -grad - lambda;
                        delta = vio / d;
                        gain = 0.5 * d * delta * delta;
                    }
                }
                else if (a > 0.0)
                {
                    grad += lambda;
                    vio = std::fabs(grad);
                    delta = -grad / d;
                    if (delta < -a)
                    {
                        delta = -a;
                        gain = delta * (grad - 0.5 * d * delta);
                        double g0 = grad - a * d - 2.0 * lambda;
                        if (g0 > 0.0)
                        {
                            double dd = -g0 / d;
                            gain = dd * (grad - 0.5 * d * dd);
                            delta += dd;
                        }
                    }
                    else gain = 0.5 * d * delta * delta;
                }
                else
                {
                    grad -= lambda;
                    vio = std::fabs(grad);
                    delta = -grad / d;
                    if (delta > -a)
                    {
                        delta = -a;
                        gain = delta * (grad - 0.5 * d * delta);
                        double g0 = grad - a * d + 2.0 * lambda;
                        if (g0 < 0.0)
                        {
                            double dd = -g0 / d;
                            gain = dd * (grad - 0.5 * d * dd);
                            delta += dd;
                        }
                    }
                    else gain = 0.5 * d * delta * delta;
                }

                // update state
                if (vio > maxvio) maxvio = vio;
                if (delta != 0.0)
                {
                    w[i] += delta;
                    noalias(difference) += delta * row(data,i);
                }

                // update gain-based preferences
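                // Self-adaptation: coordinates whose steps achieve more than the
                // running average gain get a larger preference and are therefore
                // scheduled more often in later sweeps; preferences are clamped
                // to [PREF_MIN, PREF_MAX]. The first sweep only accumulates the
                // initial average gain.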
                {
                    if (iter == 0)
                        average_gain += gain / (double)dim;
                    else
                    {
                        double change = CHANGE_RATE * (gain / average_gain - 1.0);
                        double newpref = pref[i] * std::exp(change);
                        newpref = std::min(std::max(newpref, PREF_MIN), PREF_MAX);
                        prefsum += newpref - pref[i];
                        pref[i] = newpref;
                        average_gain = (1.0 - gain_learning_rate) * average_gain + gain_learning_rate * gain;
                    }
                }
            }
            iter++;

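            // Stopping test: maxvio is the largest optimality violation seen in
            // this sweep. Since a sweep may have skipped coordinates with a low
            // preference, the solver only stops once a full sweep with uniform
            // preferences confirms the criterion (tracked by "canstop").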
            if (maxvio <= m_accuracy)
            {
                if (canstop)
                    break;
                else
                {
                    // prepare full sweep for a reliable check of the stopping criterion
                    canstop = true;
                    noalias(pref) = blas::repeat(1.0, dim);
                    prefsum = (double)dim;
                    if (verbose) std::cout << "*" << std::flush;
                }
            }
            else
            {
                canstop = false;
                if (verbose) std::cout << "." << std::flush;
            }
        }

        // write the weight vector into the model
        RealMatrix mat(1, w.size());
        row(mat, 0) = w;
        model.setStructure(mat);
    }

protected:
    double m_lambda;      ///< regularization parameter
    double m_accuracy;    ///< gradient accuracy
};
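
// A minimal usage sketch, not part of the library header. It assumes a
// regression dataset "data" of type LabeledData<RealVector, RealVector> has
// already been loaded, and that the trained coefficients are read back through
// LinearModel::matrix(); the variable names and the chosen lambda are
// illustrative only.
//
//     LinearModel<RealVector> model;              // will hold the weight vector w
//     LassoRegression<RealVector> trainer(0.1);   // lambda = 0.1, default accuracy 0.01
//     trainer.train(model, data);
//     RealMatrix const& w = model.matrix();       // 1 x dim matrix of sparse coefficients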


}
#endif