NegativeGaussianProcessEvidence.h
//===========================================================================
/*!
 *
 *
 * \brief Evidence for model selection of a regularization network/Gaussian process.
 *
 *
 * \author C. Igel, T. Glasmachers, O. Krause
 * \date 2007-2012
 *
 *
 * \par Copyright 1995-2017 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <http://shark-ml.org/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================

#ifndef SHARK_OBJECTIVEFUNCTIONS_NEGATIVEGAUSSIANPROCESSEVIDENCE_H
#define SHARK_OBJECTIVEFUNCTIONS_NEGATIVEGAUSSIANPROCESSEVIDENCE_H

#include <shark/ObjectiveFunctions/AbstractObjectiveFunction.h>
#include <shark/Models/Kernels/KernelHelpers.h>

#include <shark/LinAlg/Base.h>
namespace shark {


///
/// \brief Evidence for model selection of a regularization network/Gaussian process.
///
/// Let \f$M = K + C I\f$ denote the regularized (kernel Gram) covariance matrix,
/// where \f$C = \beta^{-1}\f$ is the regularization (noise variance) parameter,
/// and let \f$t\f$ denote the corresponding label vector. For the evidence we have:
/// \f[ E = 1/2 \cdot [ -\log(\det(M)) - t^T M^{-1} t - N \log(2 \pi) ] \f]
///
/// The evidence is also known as the marginal (log-)likelihood. For
/// details, please see:
///
/// C.E. Rasmussen & C.K.I. Williams, Gaussian
/// Processes for Machine Learning, section 5.4, MIT Press, 2006
///
/// C.M. Bishop, Pattern Recognition and Machine Learning, section
/// 6.4.3, Springer, 2006
///
/// This objective function returns the *negative* evidence, so minimizing it
/// maximizes the evidence. The regularization parameter can be encoded in
/// different ways; the exponential encoding is the proper choice for
/// unconstrained optimization. Be careful not to mix up different encodings
/// between trainer and evidence.
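///
/// \par Example
/// A minimal usage sketch. The Gaussian RBF kernel, the Rprop optimizer, the
/// starting point and the iteration count are only illustrative choices; any
/// kernel providing parameter derivatives and any gradient-based optimizer
/// can be used in their place.
/// \code
/// LabeledData<RealVector, RealVector> data = ...;   // regression training data
/// GaussianRbfKernel<> kernel(0.5);                  // kernel with an initial bandwidth
///
/// // unconstrained = true: the last parameter is interpreted as log(beta^{-1})
/// NegativeGaussianProcessEvidence<> evidence(data, &kernel, true);
///
/// RealVector start = kernel.parameterVector() | 0.0; // kernel parameters followed by log noise variance
/// IRpropPlus optimizer;
/// optimizer.init(evidence, start);
/// for (std::size_t i = 0; i != 100; ++i)
///     optimizer.step(evidence);
/// // optimizer.solution().point now holds the evidence-optimized parameters
/// \endcode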
template<class InputType = RealVector, class OutputType = RealVector, class LabelType = RealVector>
class NegativeGaussianProcessEvidence : public SingleObjectiveFunction
{
public:
	typedef LabeledData<InputType, LabelType> DatasetType;
	typedef AbstractKernelFunction<InputType> KernelType;

	/// \param dataset: training data for the Gaussian process
	/// \param kernel: pointer to external kernel function
	/// \param unconstrained: exponential encoding of the regularization parameter for unconstrained optimization
	NegativeGaussianProcessEvidence(
		DatasetType const& dataset,
		KernelType* kernel,
		bool unconstrained = false
	): m_dataset(dataset)
	, mep_kernel(kernel)
	, m_unconstrained(unconstrained)
	{
		if (kernel->hasFirstParameterDerivative()) m_features |= HAS_FIRST_DERIVATIVE;
		setThreshold(0.);
	}

	/// \brief From INameable: return the class name.
	std::string name() const
	{ return "NegativeGaussianProcessEvidence"; }

	std::size_t numberOfVariables() const{
		return 1 + mep_kernel->numberOfParameters();
	}

	/// Let \f$M\f$ denote the (kernel Gram) covariance matrix and
	/// \f$t\f$ the label vector. For the evidence we have: \f[ E = 1/2 \cdot [ -\log(\det(M)) - t^T M^{-1} t - N \log(2 \pi) ] \f]
	double eval(const RealVector& parameters) const {
		std::size_t N = m_dataset.numberOfElements();
		std::size_t kp = mep_kernel->numberOfParameters();
		// check whether the argument has the right dimensionality
		SHARK_ASSERT(1 + kp == parameters.size());

		// keep track of how often the objective function is called
		m_evaluationCounter++;

		// set parameters
		double betaInv = parameters.back();
		if(m_unconstrained)
			betaInv = std::exp(betaInv); // for unconstrained optimization
		mep_kernel->setParameterVector(subrange(parameters, 0, kp));


		// generate the regularized kernel matrix and the label vector
		RealMatrix M = calculateRegularizedKernelMatrix(*mep_kernel, m_dataset.inputs(), betaInv);
		RealVector t = column(createBatch<RealVector>(m_dataset.labels().elements()), 0);

		blas::cholesky_decomposition<RealMatrix> cholesky(M);

		// compute the determinant of M using the Cholesky factorization M = AA^T:
		// ln det(M) = 2 trace(ln A), i.e. twice the sum of the logs of the diagonal of the triangular factor A
		double logDet = 2 * trace(log(cholesky.lower_factor()));

		// we need to compute t^T M^-1 t
		// = t^T (AA^T)^-1 t = t^T (A^-T A^-1) t = ||A^-1 t||^2,
		// so we first solve the triangular system Az = t
		// and then compute ||z||^2
		RealVector z = solve(cholesky.lower_factor(), t, blas::lower(), blas::left());

		// equation (6.69) on page 311 in the book C.M. Bishop, Pattern Recognition and Machine Learning, Springer, 2006
		// e = 1/2 * [ -log(det(M)) - t^T M^{-1} t - N log(2 pi) ]
		double e = 0.5 * (-logDet - norm_sqr(z) - N * std::log(2.0 * M_PI));

		// return the *negative* evidence
		return -e;
	}

	/// Let \f$M\f$ denote the regularized (kernel Gram) covariance matrix.
	/// For the evidence we have:
	/// \f[ E = 1/2 \cdot [ -\log(\det(M)) - t^T M^{-1} t - N \log(2 \pi) ] \f]
	/// For a kernel parameter \f$p\f$ and the regularization parameter \f$C = \beta^{-1}\f$ we get the derivatives:
	/// \f[ dE/dC = 1/2 \cdot [ -tr(M^{-1}) + \| M^{-1} t \|^2 ] \f]
	/// \f[ dE/dp = 1/2 \cdot [ -tr(M^{-1} dM/dp) + t^T (M^{-1} dM/dp M^{-1}) t ] \f]
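	///
	/// The formula for \f$dE/dC\f$ is the special case \f$dM/dC = I\f$ of the
	/// formula for \f$dE/dp\f$, since \f$M = K + C I\f$; the quadratic term then
	/// becomes \f$t^T M^{-1} M^{-1} t = \| M^{-1} t \|^2\f$.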
	double evalDerivative(const RealVector& parameters, FirstOrderDerivative& derivative) const {
		std::size_t N = m_dataset.numberOfElements();
		std::size_t kp = mep_kernel->numberOfParameters();

		// check whether the argument has the right dimensionality
		SHARK_ASSERT(1 + kp == parameters.size());
		derivative.resize(1 + kp);

		// keep track of how often the objective function is called
		m_evaluationCounter++;

		// set parameters
		double betaInv = parameters.back();
		if(m_unconstrained)
			betaInv = std::exp(betaInv); // for unconstrained optimization
		mep_kernel->setParameterVector(subrange(parameters, 0, kp));

		// generate the regularized kernel matrix and the label vector
		RealMatrix M = calculateRegularizedKernelMatrix(*mep_kernel, m_dataset.inputs(), betaInv);
		RealVector t = column(createBatch<RealVector>(m_dataset.labels().elements()), 0);

		// compute the Cholesky decomposition of M
		blas::cholesky_decomposition<RealMatrix> cholesky(M);
		// we don't need M anymore, so save a lot of memory by freeing it
		M = RealMatrix();

		// compute the derivative w.r.t. the kernel parameters.
		// the derivative is defined as:
		// dE/da = -tr(IM dM/da) + t^T IM dM/da IM t
		// where IM is the inverse of M, tr is the trace, and a are the parameters of the kernel.
		// by substituting z = IM t we can expand the operations to:
		// dE/da = -(sum_i sum_j IM_ij * dM_ji/da) + (sum_i sum_j dM_ij/da * z_i * z_j)
		//       = sum_i sum_j (-IM_ij + z_i * z_j) * dM_ij/da
		// with W = -IM + zz^T we get
		// dE/da = sum_i sum_j W_ij dM_ij/da
		// this can be calculated as a blockwise derivative.

		// compute the inverse matrix from the Cholesky decomposition
		RealMatrix W = blas::identity_matrix<double>(N);
		cholesky.solve(W, blas::left());

		// calculate z = Wt = M^-1 t
		RealVector z = prod(W, t);

		// W is already initialized as the inverse of M, so we only need
		// to change the sign and add the outer product zz^T to obtain W
		W *= -1;
		noalias(W) += outer_prod(z, z);


		// now calculate the derivative
		RealVector kernelGradient = 0.5 * calculateKernelMatrixParameterDerivative(*mep_kernel, m_dataset.inputs(), W);

		// compute the derivative w.r.t. the regularization parameter.
		// we have: dE/dC = 1/2 * [ -tr(M^{-1}) + ||M^{-1} t||^2 ],
		// which equals 1/2 tr(W), since tr(W) = -tr(M^{-1}) + tr(zz^T) and tr(zz^T) = ||z||^2
		double betaInvDerivative = 0.5 * trace(W);
		if(m_unconstrained)
			betaInvDerivative *= betaInv;

		// merge both derivatives; since we return the negative evidence, multiply by -1
		noalias(derivative) = -(kernelGradient | betaInvDerivative);

		// truncate the gradient vector
		for(std::size_t i = 0; i < derivative.size(); i++)
			if(std::abs(derivative(i)) < m_derivativeThresholds(i)) derivative(i) = 0;

		// compute the evidence
		// compute the determinant of M (see eval() for why this works)
		double logDetM = 2 * trace(log(cholesky.lower_factor()));
		double e = 0.5 * (-logDetM - inner_prod(t, z) - N * std::log(2.0 * M_PI));
		return -e;
	}

	/// set the threshold value for truncating partial derivatives
	void setThreshold(double d) {
		m_derivativeThresholds = RealVector(mep_kernel->numberOfParameters() + 1, d); // plus one parameter for the prior
	}

	/// set threshold values for truncating partial derivatives
	void setThresholds(RealVector& c) {
		SHARK_ASSERT(m_derivativeThresholds.size() == c.size());
		m_derivativeThresholds = c;
	}


private:
	/// training data for the Gaussian process
	DatasetType m_dataset;

	/// thresholds for setting derivatives to zero
	RealVector m_derivativeThresholds;

	/// pointer to the external kernel function
	KernelType* mep_kernel;

	/// Indicates whether the log() of the regularization parameter is
	/// optimized. This is useful for unconstrained
	/// optimization. The default value is false.
	bool m_unconstrained;
};


}
#endif