LogisticRegression.h
//===========================================================================
/*!
 *
 *
 * \brief Logistic Regression
 *
 *
 *
 * \author O.Krause
 * \date 2017
 *
 *
 * \par Copyright 1995-2017 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <http://shark-ml.org/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================


#ifndef SHARK_ALGORITHMS_TRAINERS_LOGISTICREGRESSION_H
#define SHARK_ALGORITHMS_TRAINERS_LOGISTICREGRESSION_H

//headers for the model, trainer base class, error function, loss, regularizers and optimizer used below
#include <shark/Models/LinearClassifier.h>
#include <shark/Algorithms/Trainers/AbstractWeightedTrainer.h>
#include <shark/Algorithms/GradientDescent/LBFGS.h>
#include <shark/ObjectiveFunctions/ErrorFunction.h>
#include <shark/ObjectiveFunctions/Loss/CrossEntropy.h>
#include <shark/ObjectiveFunctions/Regularizer.h>
#include <shark/ObjectiveFunctions/BoxConstraintHandler.h>
#include <cmath>


namespace shark {

/// \brief Trainer for logistic regression
///
/// Logistic regression solves the following optimization problem:
/// \f[ \min_{w,b} \sum_i u_i l(y_i,f(x_i^Tw+b)) +\lambda_1 |w|_1 +\lambda_2 |w|^2_2 \f]
/// where \f$l\f$ is the cross-entropy loss and \f$u_i\f$ are individual weights for each
/// point (assumed to be 1 when no weights are given). Logistic regression is one of the
/// best-known machine learning algorithms for classification using linear models.
///
/// The solver is based on LBFGS when no l1-regularization is used. Otherwise the problem
/// is transformed into a constrained problem and a constrained LBFGS algorithm is used.
/// This is one of the most efficient solvers for logistic regression as long as the
/// number of data points is not too large.
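///
/// A minimal usage sketch (the dataset is assumed to have been filled with labeled
/// training data elsewhere; the regularization values are placeholders):
/// \code
/// ClassificationDataset data;               // fill with labeled training data
/// LogisticRegression<> trainer(0.0, 1.0);   // lambda1 = 0 (no l1), lambda2 = 1
/// LinearClassifier<> model;
/// trainer.train(model, data);
/// Data<unsigned int> predictions = model(data.inputs());
/// \endcode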
template <class InputVectorType = RealVector>
class LogisticRegression : public AbstractWeightedTrainer<LinearClassifier<InputVectorType> >, public IParameterizable<>
{
private:
	typedef AbstractWeightedTrainer<LinearClassifier<InputVectorType> > base_type;
public:
	typedef typename base_type::ModelType ModelType;
	typedef typename base_type::DatasetType DatasetType;
	typedef typename base_type::WeightedDatasetType WeightedDatasetType;

	/// \brief Constructor.
	///
	/// \param lambda1 value of the 1-norm regularization parameter (see class description)
	/// \param lambda2 value of the 2-norm regularization parameter (see class description)
	/// \param bias whether to train with bias or not
	/// \param accuracy stopping criterion for the iterative solver: the maximal gradient component of the objective function
	LogisticRegression(double lambda1 = 0, double lambda2 = 0, bool bias = true, double accuracy = 1.e-8)
	: m_bias(bias){
		setLambda1(lambda1);
		setLambda2(lambda2);
		setAccuracy(accuracy);
	}

	/// \brief From INameable: return the class name.
	std::string name() const
	{ return "LogisticRegression"; }


	/// \brief Return the current setting of the l1-regularization parameter.
	double lambda1() const{
		return m_lambda1;
	}

	/// \brief Return the current setting of the l2-regularization parameter.
	double lambda2() const{
		return m_lambda2;
	}

	/// \brief Set the l1-regularization parameter.
	void setLambda1(double lambda){
		SHARK_RUNTIME_CHECK(lambda >= 0.0, "Lambda1 must be non-negative");
		m_lambda1 = lambda;
	}

	/// \brief Set the l2-regularization parameter.
	void setLambda2(double lambda){
		SHARK_RUNTIME_CHECK(lambda >= 0.0, "Lambda2 must be non-negative");
		m_lambda2 = lambda;
	}

	/// \brief Return the current setting of the accuracy (maximal gradient component of the optimization problem).
	double accuracy() const{
		return m_accuracy;
	}

	/// \brief Set the accuracy (maximal gradient component of the optimization problem).
	void setAccuracy(double accuracy){
		SHARK_RUNTIME_CHECK(accuracy > 0.0, "Accuracy must be positive");
		m_accuracy = accuracy;
	}

	/// \brief Get the regularization parameters lambda1 and lambda2 through the IParameterizable interface.
	RealVector parameterVector() const{
		return {m_lambda1,m_lambda2};
	}

	/// \brief Set the regularization parameters lambda1 and lambda2 through the IParameterizable interface.
	void setParameterVector(const RealVector& param){
		SIZE_CHECK(param.size() == 2);
		setLambda1(param(0));
		setLambda2(param(1));
	}

	/// \brief Return the number of parameters (two in this case).
	size_t numberOfParameters() const{
		return 2;
	}

	/// \brief Train a linear model with logistic regression.
	void train(ModelType& model, DatasetType const& dataset){
		optimize(model, dataset);
	}

	/// \brief Train a linear model with logistic regression using weights.
	void train(ModelType& model, WeightedDatasetType const& dataset){
		optimize(model, dataset);
	}

private:

	template<class DatasetT>
	void optimize(ModelType& model, DatasetT const& dataset){
		//initialize model
		std::size_t numOutputs = numberOfClasses(dataset);
		if(numOutputs == 2) numOutputs = 1;
		auto& innerModel = model.decisionFunction();
		innerModel.setStructure(inputDimension(dataset), numOutputs, m_bias);
		std::size_t dim = innerModel.numberOfParameters();
		innerModel.setParameterVector(RealVector(dim,0.0));

		//setup error function
		CrossEntropy loss;
		ErrorFunction error(dataset, &innerModel, &loss);//note: chooses a different implementation depending on the dataset type

		//handle two-norm regularization
		TwoNormRegularizer regularizer;
		if(m_lambda2 > 0.0){
			//set mask to skip the bias weights (the last numOutputs parameters)
			if(m_bias){
				RealVector mask(dim,1.0);
				subrange(mask,dim - numOutputs, dim).clear();
				regularizer.setMask(mask);
			}
			error.setRegularizer(m_lambda2, &regularizer);
		}

		//no l1-regularization needed -> simple case
		if(m_lambda1 == 0){
			LBFGS optimizer;
			error.init();
			optimizer.init(error);
			RealVector lastPoint = optimizer.solution().point;
			while(norm_inf(optimizer.derivative()) > m_accuracy){
				optimizer.step(error);
				//if no progress has been made, something is wrong or we have numerical problems
				//=> abort.
				if(norm_sqr(optimizer.solution().point - lastPoint) == 0) break;
				noalias(lastPoint) = optimizer.solution().point;
			}
			model.setParameterVector(lastPoint);
			return;
		}

		//l1-regularization is more painful.
		//we transform the l1-regularization |w|
		//by adding two sets of parameters, w = u - v, u >= 0, v >= 0 and |w| = 1^T u + 1^T v.
		//the resulting function is differentiable, however we have added constraints.
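		//in terms of u and v the optimization problem becomes
		//  min_{u >= 0, v >= 0} E(u - v) + lambda_1 * 1^T (u + v)
		//where E is the (possibly l2-regularized) cross-entropy error defined above and
		//the bias parameters remain unconstrained.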
		L1Reformulation function(&error, m_lambda1, dim - m_bias * numOutputs);
		LBFGS optimizer;
		function.init();
		optimizer.init(function);
		RealVector lastPoint = optimizer.solution().point;
		for(;;){
			//check whether we are done
			bool optimal = true;
			auto const& derivative = optimizer.derivative();
			for(std::size_t i = 0; i != lastPoint.size(); ++i){
				if(lastPoint(i) < 1.e-13 && -derivative(i) > m_accuracy){//coordinate on constraint and derivative pushes away from constraint
					optimal = false;
					break;
				}else if(lastPoint(i) > 1.e-13 && std::abs(derivative(i)) > m_accuracy){//free coordinate and derivative is not close to 0
					optimal = false;
					break;
				}
			}
			if(optimal)
				break;

			optimizer.step(function);
			//if no progress has been made, something is wrong or we have numerical problems
			//=> abort.
			if(norm_sqr(optimizer.solution().point - lastPoint) == 0) break;
			noalias(lastPoint) = optimizer.solution().point;
		}

		std::size_t n = dim - m_bias * numOutputs;
		//construct parameter vector from solution
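		//the first n variables are u, the next n are v; the trained weights are w = u - v,
		//followed by the (unregularized) bias parameters, which are taken over unchanged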
		RealVector param = (subrange(lastPoint,0,n) - subrange(lastPoint,n, 2 * n)) | subrange(lastPoint,2*n,lastPoint.size());
		model.setParameterVector(param);
	}

private:
	bool m_bias;       ///< whether to train with the bias parameter or not
	double m_lambda1;  ///< l1-regularization parameter
	double m_lambda2;  ///< l2-regularization parameter
	double m_accuracy; ///< gradient accuracy

	class L1Reformulation: public SingleObjectiveFunction{
	public:
		L1Reformulation(ErrorFunction* error, double lambda1, std::size_t regularizedParams)
		: mep_error(error), m_lambda1(lambda1), m_regularizedParams(regularizedParams){
			m_features |= CAN_PROPOSE_STARTING_POINT;
			m_features |= HAS_FIRST_DERIVATIVE;

			std::size_t dim = numberOfVariables();
			double unconstrained = 1e100;
			RealVector lower(dim,0.0);
			subrange(lower, 2 * m_regularizedParams,dim) = blas::repeat(-unconstrained,dim - 2 * m_regularizedParams);
			RealVector upper(dim,unconstrained);
			m_handler.setBounds(lower,upper);
			announceConstraintHandler(&m_handler);
		}

		SearchPointType proposeStartingPoint()const {
			return RealVector(numberOfVariables(),0.0);
		}

		std::size_t numberOfVariables()const{
			return mep_error->numberOfVariables() + m_regularizedParams;
		}

		double eval(RealVector const& input) const{
			std::size_t dim = input.size();
			std::size_t n = m_regularizedParams;
			RealVector params = (subrange(input,0,n) - subrange(input,n, 2 * n)) | subrange(input,2*n,dim);
			return mep_error->eval(params) + m_lambda1 * sum(subrange(input,0,2*n));
		}
		ResultType evalDerivative( const SearchPointType & input, FirstOrderDerivative & derivative ) const{
			std::size_t dim = input.size();
			std::size_t n = m_regularizedParams;
			RealVector params = (subrange(input,0,n) - subrange(input,n, 2 * n)) | subrange(input,2*n,dim);
			FirstOrderDerivative paramDerivative;
			double error = mep_error->evalDerivative(params, paramDerivative);
			derivative.resize(numberOfVariables());
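			//chain rule for w = u - v: dF/du = dE/dw + lambda_1 and dF/dv = -dE/dw + lambda_1;
			//the unregularized (bias) parameters keep the plain error derivative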
			noalias(subrange(derivative,0,n)) = m_lambda1 + subrange(paramDerivative,0,n);
			noalias(subrange(derivative,n,2 * n)) = m_lambda1 - subrange(paramDerivative,0,n);
			noalias(subrange(derivative,2 * n,dim)) = subrange(paramDerivative,n,dim - n);
			return error + m_lambda1 * sum(subrange(input,0,2*n));
		}

	private:
		ErrorFunction* mep_error;
		double m_lambda1;
		BoxConstraintHandler<SearchPointType> m_handler;
		std::size_t m_regularizedParams;
	};
};


}
#endif