CrossEntropy.h
//===========================================================================
/*!
 *
 *
 * \brief Error measure for classification tasks that can be used
 * as the objective function for training.
 *
 *
 *
 *
 * \author -
 * \date -
 *
 *
 * \par Copyright 1995-2017 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <http://shark-ml.org/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */

#ifndef SHARK_OBJECTIVEFUNCTIONS_LOSS_CROSS_ENTROPY_H
#define SHARK_OBJECTIVEFUNCTIONS_LOSS_CROSS_ENTROPY_H

#include <shark/ObjectiveFunctions/Loss/AbstractLoss.h>

namespace shark{

/*!
 * \brief Error measure for classification tasks that can be used
 * as the objective function for training.
 *
 * If the model returns a vector whose components reflect the
 * logarithmic conditional probabilities of class membership given an input vector,
 * 'CrossEntropy' is the adequate error measure for model training.
 * For \em C > 1 classes the loss function is defined as
 * \f[
 * E = - \ln \frac{\exp(x_c)}{\sum_{c^{\prime}=1}^C \exp(x_{c^{\prime}})} = - x_c + \ln \sum_{c^{\prime}=1}^C \exp(x_{c^{\prime}})
 * \f]
 * where \em x is the prediction vector of the model and \em c is the class label. In the case of only one
 * model output and binary classification, another, numerically more stable formulation is used:
 * \f[
 * E = \ln(1 + e^{-yx})
 * \f]
 * Here \em y is the class label recoded to -1 or 1 via \f$ y = 2c - 1 \f$. This formulation is numerically
 * more stable because when \f$ e^{-yx} \f$ is large, the error function is well approximated by the linear
 * function \f$ -yx \f$, and when the exponential is very small, evaluating \f$ \ln(0) \f$ is avoided.
 *
 * The class labels must be integers starting from 0. Also, for theoretical reasons, the output neurons of a
 * neural network must be linear.
 */
class CrossEntropy : public AbstractLoss<unsigned int,RealVector>
{
private:
    typedef AbstractLoss<unsigned int,RealVector> base_type;

    //uses a different formula to compute the binary case for 1 output,
    //which should be numerically more stable
    //formula: ln(1+exp(-yx)) with y = -1/1
    double evalError(double label,double exponential,double value) const {
        if(value*label < -200){
            //below this, we might get numeric instabilities,
            //but we know that ln(1+exp(x)) converges to x for big arguments
            return -value * label;
        }
        return std::log(1+exponential);
    }
public:
    CrossEntropy()
    {
        m_features |= HAS_FIRST_DERIVATIVE;
        //~ m_features |= HAS_SECOND_DERIVATIVE;
    }

    /// \brief From INameable: return the class name.
    std::string name() const
    { return "CrossEntropy"; }

    // annoyingness of C++ templates
    using base_type::eval;

    double eval(UIntVector const& target, RealMatrix const& prediction) const {
        double error = 0;
        for(std::size_t i = 0; i != prediction.size1(); ++i){
            error += eval(target(i), row(prediction,i));
        }
        return error;
    }

    double eval(ConstLabelReference target, ConstOutputReference prediction) const{
        if ( prediction.size() == 1 )
        {
            RANGE_CHECK ( target < 2 );
            double label = 2.0 * target - 1; //converts labels from 0/1 to -1/1
            double exponential = std::exp( -label * prediction(0) );
            return evalError(label,exponential,prediction(0));
        }else{
            RANGE_CHECK ( target < prediction.size() );

            //calculate the log norm in a numerically stable way:
            //we subtract the maximum prior to exponentiation to
            //ensure that the exponentiation result will still fit in a double
            double maximum = max(prediction);
            double logNorm = sum(exp(prediction-maximum));
            logNorm = std::log(logNorm) + maximum;
            return logNorm - prediction(target);
        }
    }

    double evalDerivative(UIntVector const& target, RealMatrix const& prediction, RealMatrix& gradient) const {
        gradient.resize(prediction.size1(),prediction.size2());
        if ( prediction.size2() == 1 )
        {
            double error = 0;
            for(std::size_t i = 0; i != prediction.size1(); ++i){
                RANGE_CHECK ( target(i) < 2 );
                double label = 2 * static_cast<double>(target(i)) - 1; //converts labels from 0/1 to -1/1
                double exponential = std::exp( -label * prediction(i,0) );
                double sigmoid = 1.0/(1.0+exponential);
                gradient(i,0) = -label * (1.0 - sigmoid);
                error += evalError(label,exponential,prediction(i,0));
            }
            return error;
        }else{
            double error = 0;
            for(std::size_t i = 0; i != prediction.size1(); ++i){
                RANGE_CHECK ( target(i) < prediction.size2() );
                auto gradRow = row(gradient,i);

                //calculate the log norm in a numerically stable way:
                //we subtract the maximum prior to exponentiation to
                //ensure that the exponentiation result will still fit in a double.
                //this does not change the result, as the values get normalized by
                //their sum and thus the correction term cancels out.
                double maximum = max(row(prediction,i));
                noalias(gradRow) = exp(row(prediction,i) - maximum);
                double norm = sum(gradRow);
                gradRow /= norm;
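                //gradRow now holds the softmax probabilities of sample i;
                //subtracting 1 at the target index yields softmax(x_i) minus the one-hot target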
                gradient(i,target(i)) -= 1;
                error += std::log(norm) - prediction(i,target(i)) + maximum;
            }
            return error;
        }
    }
    double evalDerivative(ConstLabelReference target, ConstOutputReference prediction, OutputType& gradient) const {
        gradient.resize(prediction.size());
        if ( prediction.size() == 1 )
        {
            RANGE_CHECK ( target < 2 );
            double label = 2.0 * target - 1; //converts labels from 0/1 to -1/1
            double exponential = std::exp( -label * prediction(0) );
            double sigmoid = 1.0/(1.0+exponential);
            gradient(0) = -label * (1.0 - sigmoid);
            return evalError(label,exponential,prediction(0));
        }else{
            RANGE_CHECK ( target < prediction.size() );

            //calculate the log norm in a numerically stable way:
            //we subtract the maximum prior to exponentiation to
            //ensure that the exponentiation result will still fit in a double.
            //this does not change the result, as the values get normalized by
            //their sum and thus the correction term cancels out.
            double maximum = max(prediction);
            noalias(gradient) = exp(prediction - maximum);
            double norm = sum(gradient);
            gradient /= norm;
            gradient(target) -= 1;
            return std::log(norm) - prediction(target) + maximum;
        }
    }

    double evalDerivative(
        ConstLabelReference target, ConstOutputReference prediction,
        OutputType& gradient, MatrixType& hessian
    ) const {
        gradient.resize(prediction.size());
        hessian.resize(prediction.size(),prediction.size());
        if ( prediction.size() == 1 )
        {
            RANGE_CHECK ( target < 2 );
            double label = 2 * static_cast<double>(target) - 1; //converts labels from 0/1 to -1/1
            double exponential = std::exp( -label * prediction(0) );
            double sigmoid = 1.0/(1.0+exponential);
            gradient(0) = -label * (1.0 - sigmoid);
            hessian(0,0) = sigmoid * (1 - sigmoid);
            return evalError(label,exponential,prediction(0));
        }
        else
        {
            RANGE_CHECK ( target < prediction.size() );
            //calculate the log norm in a numerically stable way:
            //we subtract the maximum prior to exponentiation to
            //ensure that the exponentiation result will still fit in a double.
            //this does not change the result, as the values get normalized by
            //their sum and thus the correction term cancels out.
            double maximum = max(prediction);
            noalias(gradient) = exp(prediction-maximum);
            double norm = sum(gradient);
            gradient /= norm;

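            //the Hessian of the log-sum-exp term is diag(p) - p p^T,
            //where p is the softmax vector currently stored in 'gradient'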
            noalias(hessian) = -outer_prod(gradient,gradient);
            noalias(diag(hessian)) += gradient;
            gradient(target) -= 1;

            return std::log(norm) - prediction(target) + maximum;
        }
    }
};


}
#endif
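
A minimal usage sketch of the interface defined above. It exercises the batch overloads of eval and evalDerivative exactly as declared in the listing; the include path, the main() scaffolding, and the example numbers are illustrative assumptions, and RealMatrix/UIntVector are taken to be Shark's usual dense linear-algebra types.

#include <shark/ObjectiveFunctions/Loss/CrossEntropy.h>
#include <iostream>

int main(){
    using namespace shark;

    CrossEntropy loss;

    //two samples with three classes each; the model outputs are treated as
    //unnormalized log-probabilities (logits), one row per sample
    RealMatrix predictions(2,3);
    predictions(0,0) = 1.0; predictions(0,1) = 2.5; predictions(0,2) = 0.5;
    predictions(1,0) = 0.2; predictions(1,1) = 0.1; predictions(1,2) = 1.7;

    UIntVector labels(2);
    labels(0) = 1; labels(1) = 2;

    //summed loss over the batch: sum_i ( log-sum-exp(x_i) - x_i(label_i) )
    std::cout << "loss: " << loss.eval(labels, predictions) << "\n";

    //loss and gradient in one call; each gradient row is the softmax of the
    //corresponding prediction row minus the one-hot encoding of its label
    RealMatrix gradients;
    double error = loss.evalDerivative(labels, predictions, gradients);
    std::cout << "loss: " << error << "\n";
    for(std::size_t i = 0; i != gradients.size1(); ++i){
        for(std::size_t j = 0; j != gradients.size2(); ++j)
            std::cout << gradients(i,j) << " ";
        std::cout << "\n";
    }
}

The gradient matrix is resized inside evalDerivative, so it can be passed in default-constructed; this mirrors how the batch overload in the listing calls gradient.resize before filling it row by row.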