ArdKernel.h
//===========================================================================
/*!
 *
 *
 * \brief Gaussian automatic relevance detection (ARD) kernel
 *
 *
 *
 * \author T. Glasmachers, O. Krause, M. Tuma
 * \date 2010-2012
 *
 *
 * \par Copyright 1995-2017 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <http://shark-ml.org/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================

#ifndef SHARK_MODELS_KERNELS_GAUSSIAN_ARD_KERNEL_H
#define SHARK_MODELS_KERNELS_GAUSSIAN_ARD_KERNEL_H

#include <shark/Models/Kernels/AbstractKernelFunction.h>

namespace shark {

/// \brief Automatic relevance detection kernel for unconstrained parameter optimization
///
/// The automatic relevance detection (ARD) kernel is a general Gaussian kernel with
/// diagonal covariance matrix:
/// \f$ k(x, z) = \exp(-\sum_i \gamma_i (x_i - z_i)^2) \f$.
/// The ARD kernel holds one real-valued weight \f$ \gamma_i > 0 \f$ per input dimension.
/// The parameters \f$ p_i \f$ are encoded as \f$ p_i = \log(\gamma_i) \f$, allowing for
/// unconstrained optimization: the exposed parameters are passed through an exp before
/// being used in the actual computations.
///
/// Note that, like most models/kernels designed for unconstrained optimization, the
/// constructor takes the true weights \f$ \gamma_i \f$ as its argument, while the
/// parameter getter and setter operate on the encoded values \f$ p_i \f$, not on the
/// true weights.
///
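/// A minimal sketch of the encoding (hypothetical values):
/// \code
/// ARDKernelUnconstrained<> kernel(2, 4.0);  // gamma_0 = gamma_1 = 4
/// RealVector p = kernel.parameterVector();  // p_i = log(4)
/// p(0) = 0.0;                               // request gamma_0 = exp(0) = 1
/// kernel.setParameterVector(p);             // gammaVector() is now (1, 4)
/// \endcode
///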
/// \todo slow default implementation; use BLAS3 calls to make things faster
template<class InputType=RealVector>
class ARDKernelUnconstrained : public AbstractKernelFunction<InputType>{
private:
	typedef AbstractKernelFunction<InputType> base_type;

	struct InternalState: public State{
		RealMatrix kxy;

		void resize(std::size_t sizeX1, std::size_t sizeX2){
			kxy.resize(sizeX1, sizeX2);
		}
	};
public:
	typedef typename base_type::BatchInputType BatchInputType;
	typedef typename base_type::ConstInputReference ConstInputReference;
	typedef typename base_type::ConstBatchInputReference ConstBatchInputReference;

	/// Constructor
	/// \param dim input dimension
	/// \param gamma_init initial gamma value for all dimensions (the true weight, used as passed in; not the log-encoded parameter)
	ARDKernelUnconstrained(unsigned int dim, double gamma_init = 1.0){
		SHARK_RUNTIME_CHECK(gamma_init > 0, "[ARDKernelUnconstrained::ARDKernelUnconstrained] Expected positive weight.");

		//init abstract model's informational flags
		this->m_features |= base_type::HAS_FIRST_PARAMETER_DERIVATIVE;
		this->m_features |= base_type::HAS_FIRST_INPUT_DERIVATIVE;

		//initialize self
		m_inputDimensions = dim;
		m_gammas.resize(m_inputDimensions);
		for (unsigned int i = 0; i < m_inputDimensions; i++){
			m_gammas(i) = gamma_init;
		}
	}

	/// \brief From INameable: return the class name.
	std::string name() const
	{ return "ARDKernelUnconstrained"; }

	RealVector parameterVector() const{
		return log(m_gammas);
	}
	void setParameterVector(RealVector const& newParameters){
		SIZE_CHECK(newParameters.size() == m_inputDimensions);
		noalias(m_gammas) = exp(newParameters);
	}
	std::size_t numberOfParameters() const{
		return m_inputDimensions;
	}

	///\brief creates the internal state of the kernel
	boost::shared_ptr<State> createState()const{
		return boost::shared_ptr<State>(new InternalState());
	}

	/// convenience methods for setting/getting the actual gamma values
	RealVector gammaVector() const{
		return m_gammas;
	}
	void setGammaVector(RealVector const& newGammas){
#ifndef NDEBUG
		SIZE_CHECK(newGammas.size() == m_inputDimensions);
		for (unsigned int i = 0; i < m_inputDimensions; i++){
			RANGE_CHECK(newGammas(i) > 0);
		}
#endif
		m_gammas = newGammas;
	}
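	// Note: setGammaVector(g) is equivalent to setParameterVector(log(g));
	// both leave m_gammas == g, only the encoding of the argument differs.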

	/// \brief evaluates \f$ k(x,z) \f$
	///
	/// ARD kernel evaluation
	/// \f[ k(x, z) = \exp(-\sum_i \gamma_i (x_i - z_i)^2) \f]
	double eval(ConstInputReference x1, ConstInputReference x2) const{
		SIZE_CHECK(x1.size() == x2.size());
		SIZE_CHECK(x1.size() == m_inputDimensions);
		double dmnorm2 = diagonalMahalanobisDistanceSqr(x1, x2, m_gammas);
		return std::exp(-dmnorm2);
	}
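
	// Quick numeric sanity check (hypothetical values): with m_gammas = (0.5, 2.0),
	// x1 = (1, 0) and x2 = (0, 1), the weighted squared distance is
	// 0.5*(1-0)^2 + 2.0*(0-1)^2 = 2.5, so eval(x1, x2) returns exp(-2.5), about 0.0821.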

	/// \brief evaluates \f$ k(x,z) \f$ for a whole batch
	///
	/// ARD kernel evaluation
	/// \f[ k(x, z) = \exp(-\sum_i \gamma_i (x_i - z_i)^2) \f]
	void eval(ConstBatchInputReference batchX1, ConstBatchInputReference batchX2, RealMatrix& result) const{
		SIZE_CHECK(batchX1.size2() == batchX2.size2());
		SIZE_CHECK(batchX1.size2() == m_inputDimensions);

		std::size_t sizeX1 = batchX1.size1();
		std::size_t sizeX2 = batchX2.size1();

		ensure_size(result, sizeX1, sizeX2);
		//todo: implement fast version of diagonalMahalanobisDistanceSqr for matrices
		for(std::size_t i = 0; i != sizeX1; ++i){
			for(std::size_t j = 0; j != sizeX2; ++j){
				double dmnorm2 = diagonalMahalanobisDistanceSqr(row(batchX1,i), row(batchX2,j), m_gammas);
				result(i,j) = std::exp(-dmnorm2);
			}
		}
	}
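
	// One possible shape for the todo above (a sketch, not shipped code): scale
	// every input elementwise, u = sqrt(gamma) * x and v = sqrt(gamma) * z; then
	//   ||u_i - v_j||^2 = ||u_i||^2 + ||v_j||^2 - 2 u_i.v_j,
	// so the full distance matrix is one matrix-matrix product (a single BLAS3
	// GEMM) plus row- and column-wise additions of squared norms, replacing the
	// O(sizeX1*sizeX2) scalar diagonalMahalanobisDistanceSqr calls above.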

	/// \brief evaluates \f$ k(x,z) \f$ for a whole batch, caching the results in the state object
	///
	/// ARD kernel evaluation
	/// \f[ k(x, z) = \exp(-\sum_i \gamma_i (x_i - z_i)^2) \f]
	void eval(ConstBatchInputReference batchX1, ConstBatchInputReference batchX2, RealMatrix& result, State& state) const{
		SIZE_CHECK(batchX1.size2() == batchX2.size2());
		SIZE_CHECK(batchX1.size2() == m_inputDimensions);

		std::size_t sizeX1 = batchX1.size1();
		std::size_t sizeX2 = batchX2.size1();

		InternalState& s = state.toState<InternalState>();
		s.resize(sizeX1, sizeX2);

		ensure_size(result, sizeX1, sizeX2);
		//todo: implement fast version of diagonalMahalanobisDistanceSqr for matrices
		for(std::size_t i = 0; i != sizeX1; ++i){
			for(std::size_t j = 0; j != sizeX2; ++j){
				double dmnorm2 = diagonalMahalanobisDistanceSqr(row(batchX1,i), row(batchX2,j), m_gammas);
				result(i,j) = std::exp(-dmnorm2);
				s.kxy(i,j) = result(i,j);
			}
		}
	}

	/// \brief evaluates \f$ \frac{\partial k(x,z)}{\partial p_i} \f$, weighted over a whole batch
	///
	/// Since the ARD kernel is parametrized for unconstrained optimization, we return
	/// the derivative w.r.t. the encoded parameters \f$ p_i = \log(\gamma_i) \f$ rather
	/// than w.r.t. the weights \f$ \gamma_i \f$ themselves:
	/// \f[ \frac{\partial k(x,z)}{\partial p_i} = -\gamma_i (x_i - z_i)^2 \cdot k(x,z) \f]
	void weightedParameterDerivative(
		ConstBatchInputReference batchX1,
		ConstBatchInputReference batchX2,
		RealMatrix const& coefficients,
		State const& state,
		RealVector& gradient
	) const{
		SIZE_CHECK(batchX1.size2() == batchX2.size2());
		SIZE_CHECK(batchX1.size2() == m_inputDimensions);

		std::size_t sizeX1 = batchX1.size1();
		std::size_t sizeX2 = batchX2.size1();

		ensure_size(gradient, m_inputDimensions);
		gradient.clear();
		InternalState const& s = state.toState<InternalState>();

		for(std::size_t i = 0; i != sizeX1; ++i){
			for(std::size_t j = 0; j != sizeX2; ++j){
				double coeff = coefficients(i,j) * s.kxy(i,j);
				gradient -= coeff * m_gammas * sqr(row(batchX1,i) - row(batchX2,j));
			}
		}
	}
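
	// Derivation of the update above: by the chain rule, with gamma_i = exp(p_i),
	//   dk/dp_i = dk/dgamma_i * dgamma_i/dp_i = -(x_i - z_i)^2 k(x,z) * gamma_i,
	// which the loop accumulates over all pairs, weighted by coefficients(i,j).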

	/// \brief evaluates \f$ \frac{\partial k(x,z)}{\partial x} \f$, weighted over a whole batch
	///
	/// First derivative of the ARD kernel w.r.t. the first input pattern, componentwise:
	/// \f[ \frac{\partial k(x,z)}{\partial x_i} = -2 \gamma_i (x_i - z_i) \cdot k(x,z) \f]
	void weightedInputDerivative(
		ConstBatchInputReference batchX1,
		ConstBatchInputReference batchX2,
		RealMatrix const& coefficientsX2,
		State const& state,
		BatchInputType& gradient
	) const{
		SIZE_CHECK(batchX1.size2() == batchX2.size2());
		SIZE_CHECK(batchX1.size2() == m_inputDimensions);

		std::size_t sizeX1 = batchX1.size1();
		std::size_t sizeX2 = batchX2.size1();

		InternalState const& s = state.toState<InternalState>();
		ensure_size(gradient, sizeX1, m_inputDimensions);
		gradient.clear();

		for(std::size_t i = 0; i != sizeX1; ++i){
			for(std::size_t j = 0; j != sizeX2; ++j){
				double coeff = coefficientsX2(i,j) * s.kxy(i,j);
				row(gradient,i) += coeff * m_gammas * (row(batchX1,i) - row(batchX2,j));
			}
		}
		gradient *= -2.0;
	}
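
	// Note on the update above: each summand is coefficientsX2(i,j) * k(x_i,z_j)
	// times gamma_d * (x_{i,d} - z_{j,d}) per dimension d; the common factor -2
	// of the derivative is pulled out of the double loop and applied once at the end.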

	void read(InArchive& ar){
		ar >> m_gammas;
		ar >> m_inputDimensions;
	}

	void write(OutArchive& ar) const{
		ar << m_gammas;
		ar << m_inputDimensions;
	}

protected:
	RealVector m_gammas; ///< kernel bandwidth parameters, one for each input dimension.
	std::size_t m_inputDimensions; ///< how many input dimensions = how many bandwidth parameters
};

typedef ARDKernelUnconstrained<> DenseARDKernel;
typedef ARDKernelUnconstrained<CompressedRealVector> CompressedARDKernel;
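
// Minimal usage sketch (batchX and batchZ are hypothetical RealMatrix batches whose
// number of columns matches the kernel's input dimension):
//
//   DenseARDKernel kernel(3);        // 3 input dimensions, all gamma_i = 1
//   RealMatrix K;
//   kernel.eval(batchX, batchZ, K);  // K(i,j) = exp(-sum_d gamma_d (x_{i,d} - z_{j,d})^2)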
}
#endif