WeightedSumKernel.h
//===========================================================================
/*!
 *
 *
 * \brief Weighted sum of base kernels.
 *
 *
 *
 * \author T. Glasmachers, O. Krause, M. Tuma
 * \date 2010, 2011
 *
 *
 * \par Copyright 1995-2017 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <http://shark-ml.org/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================

#ifndef SHARK_MODELS_KERNELS_WEIGHTED_SUM_KERNEL_H
#define SHARK_MODELS_KERNELS_WEIGHTED_SUM_KERNEL_H


#include <shark/Models/Kernels/AbstractKernelFunction.h>

#include <boost/utility/enable_if.hpp>
namespace shark {

/// \brief Weighted sum of kernel functions
///
/// For a set of positive definite kernels \f$ k_1, \dots, k_n \f$
/// with positive coefficients \f$ w_1, \dots, w_n \f$ the sum
/// \f[ \tilde k(x_1, x_2) := \sum_{i=1}^{n} w_i \cdot k_i(x_1, x_2) \f]
/// is again a positive definite kernel function.
/// Internally, the weights are represented as
/// \f$ w_i = \exp(\xi_i) \f$
/// to allow for unconstrained optimization.
///
/// This variant of the weighted sum kernel eliminates one redundant
/// degree of freedom by fixing the first kernel weight to 1.0.
///
/// The result of the kernel evaluation is divided by the sum of the
/// kernel weights, so that in total, this amounts to fixing the sum
/// of the weights to one.
///
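/// \par Example
/// A minimal usage sketch (added for illustration; it assumes the Shark kernels
/// GaussianRbfKernel and LinearKernel are included separately):
/// \code
/// GaussianRbfKernel<> rbf(0.5);      // sub-kernel with bandwidth parameter gamma = 0.5
/// LinearKernel<> linear;
/// std::vector<AbstractKernelFunction<RealVector>*> kernels;
/// kernels.push_back(&rbf);
/// kernels.push_back(&linear);
/// WeightedSumKernel<> sum(kernels);  // all weights start at 1; the result is normalized
/// sum.setAdaptiveAll(true);          // also expose the sub-kernels' own parameters
/// \endcode
///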
template<class InputType=RealVector>
class WeightedSumKernel : public AbstractKernelFunction<InputType>
{
private:
	typedef AbstractKernelFunction<InputType> base_type;

	struct InternalState: public State{
		RealMatrix result;
		std::vector<RealMatrix> kernelResults;
		std::vector<boost::shared_ptr<State> > kernelStates;

		InternalState(std::size_t numSubKernels)
		:kernelResults(numSubKernels),kernelStates(numSubKernels){}

		void resize(std::size_t sizeX1, std::size_t sizeX2){
			result.resize(sizeX1, sizeX2);
			for(std::size_t i = 0; i != kernelResults.size(); ++i){
				kernelResults[i].resize(sizeX1, sizeX2);
			}
		}
	};
public:
	typedef typename base_type::BatchInputType BatchInputType;
	typedef typename base_type::ConstInputReference ConstInputReference;
	typedef typename base_type::ConstBatchInputReference ConstBatchInputReference;

	WeightedSumKernel(std::vector<AbstractKernelFunction<InputType>* > const& base){
		SHARK_RUNTIME_CHECK( base.size() > 0, "[WeightedSumKernel::WeightedSumKernel] There should be at least one sub-kernel.");

		m_base.resize( base.size() );
		m_numParameters = m_base.size()-1;
		m_adaptWeights = true;
		for (std::size_t i=0; i != m_base.size() ; i++) {
			SHARK_ASSERT( base[i] != NULL );
			m_base[i].kernel = base[i];
			m_base[i].weight = 1.0;
			m_base[i].adaptive = false;
		}
		m_weightsum = (double)m_base.size();

		// set base class features
		bool hasFirstParameterDerivative = true;
		for ( unsigned int i=0; i<m_base.size(); i++ ){
			if ( !m_base[i].kernel->hasFirstParameterDerivative() ) {
				hasFirstParameterDerivative = false;
				break;
			}
		}
		bool hasFirstInputDerivative = true;
		for ( unsigned int i=0; i<m_base.size(); i++ ){
			if ( !m_base[i].kernel->hasFirstInputDerivative() ) {
				hasFirstInputDerivative = false;
				break;
			}
		}

		if ( hasFirstParameterDerivative )
			this->m_features |= base_type::HAS_FIRST_PARAMETER_DERIVATIVE;

		if ( hasFirstInputDerivative )
			this->m_features |= base_type::HAS_FIRST_INPUT_DERIVATIVE;
	}

	/// \brief From INameable: return the class name.
	std::string name() const
	{ return "WeightedSumKernel"; }

	/// Check whether the base kernel with the given index is adaptive.
	bool isAdaptive( std::size_t index ) const {
		return m_base[index].adaptive;
	}
	/// Set adaptivity of the base kernel with the given index.
	void setAdaptive( std::size_t index, bool b = true ) {
		m_base[index].adaptive = b;
		updateNumberOfParameters();
	}
	/// Set adaptivity of all base kernels.
	void setAdaptiveAll( bool b = true ) {
		for (std::size_t i=0; i!= m_base.size(); i++)
			m_base[i].adaptive = b;
		updateNumberOfParameters();
	}

	/// Get the weight of a kernel
	double weight(std::size_t index){
		RANGE_CHECK(index < m_base.size());
		return m_base[index].weight;
	}

	void setAdaptiveWeights(bool b){
		m_adaptWeights = b;
	}

	/// return the parameter vector. The first N-1 entries are the (log-encoded) kernel
	/// weights; the sub-kernels' parameters are stacked behind them.
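	/// For example, with three sub-kernels of which only the third is adaptive and
	/// has two parameters \f$ \theta_1, \theta_2 \f$, the layout is
	/// \f$ (\log w_2, \log w_3, \theta_1, \theta_2) \f$.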
	RealVector parameterVector() const {
		std::size_t num = numberOfParameters();
		RealVector ret(num);

		std::size_t index = 0;
		for (; index != m_base.size()-1; index++){
			ret(index) = std::log(m_base[index+1].weight);
		}
		for (std::size_t i=0; i != m_base.size(); i++){
			if (m_base[i].adaptive){
				std::size_t n = m_base[i].kernel->numberOfParameters();
				subrange(ret,index,index+n) = m_base[i].kernel->parameterVector();
				index += n;
			}
		}
		return ret;
	}

	///\brief creates the internal state of the kernel
	boost::shared_ptr<State> createState()const{
		InternalState* state = new InternalState(m_base.size());
		for(std::size_t i = 0; i != m_base.size(); ++i){
			state->kernelStates[i]=m_base[i].kernel->createState();
		}
		return boost::shared_ptr<State>(state);
	}

	/// set the parameter vector. The first N-1 entries are the (log-encoded) kernel
	/// weights; the sub-kernels' parameters are stacked behind them.
	void setParameterVector(RealVector const& newParameters) {
		SIZE_CHECK(newParameters.size() == numberOfParameters());

		m_weightsum = 1.0;
		std::size_t index = 0;
		for (; index != m_base.size()-1; index++){
			double w = newParameters(index);
			m_base[index+1].weight = std::exp(w);
			m_weightsum += m_base[index+1].weight;
		}
		for (std::size_t i=0; i != m_base.size(); i++){
			if (m_base[i].adaptive){
				std::size_t n = m_base[i].kernel->numberOfParameters();
				m_base[i].kernel->setParameterVector(subrange(newParameters,index,index+n));
				index += n;
			}
		}
	}

	std::size_t numberOfParameters() const {
		return m_numParameters;
	}

	/// Evaluate the weighted sum kernel according to the equation
	/// \f$ k(x, y) = \frac{\sum_i \exp(w_i) k_i(x, y)}{\sum_i \exp(w_i)} \f$.
	double eval(ConstInputReference x1, ConstInputReference x2) const{
		double numerator = 0.0;
		for (std::size_t i=0; i != m_base.size(); i++){
			double result = m_base[i].kernel->eval(x1, x2);
			numerator += m_base[i].weight*result;
		}
		return numerator / m_weightsum;
	}

	/// Evaluate the kernel according to the equation
	/// \f$ k(x, y) = \frac{\sum_i \exp(w_i) k_i(x, y)}{\sum_i \exp(w_i)} \f$
	/// for two batches of inputs.
	void eval(ConstBatchInputReference batchX1, ConstBatchInputReference batchX2, RealMatrix& result) const{
		std::size_t sizeX1 = batchSize(batchX1);
		std::size_t sizeX2 = batchSize(batchX2);
		ensure_size(result,sizeX1,sizeX2);
		result.clear();

		RealMatrix kernelResult(sizeX1,sizeX2);
		for (std::size_t i = 0; i != m_base.size(); i++){
			m_base[i].kernel->eval(batchX1, batchX2,kernelResult);
			result += m_base[i].weight*kernelResult;
		}
		result /= m_weightsum;
	}

	/// Evaluate the kernel according to the equation
	/// \f$ k(x, y) = \frac{\sum_i \exp(w_i) k_i(x, y)}{\sum_i \exp(w_i)} \f$
	/// for two batches of inputs.
	/// (see the documentation of numberOfIntermediateValues for the workings of the intermediates)
	void eval(ConstBatchInputReference batchX1, ConstBatchInputReference batchX2, RealMatrix& result, State& state) const{
		std::size_t sizeX1 = batchSize(batchX1);
		std::size_t sizeX2 = batchSize(batchX2);
		ensure_size(result,sizeX1,sizeX2);
		result.clear();

		InternalState& s = state.toState<InternalState>();
		s.resize(sizeX1,sizeX2);

		for (std::size_t i=0; i != m_base.size(); i++){
			m_base[i].kernel->eval(batchX1,batchX2,s.kernelResults[i],*s.kernelStates[i]);
			result += m_base[i].weight*s.kernelResults[i];
		}
		//store summed result
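		//(the un-normalized sum is kept because weightedParameterDerivative reuses it)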
		s.result=result;
		result /= m_weightsum;
	}

	void weightedParameterDerivative(
		ConstBatchInputReference batchX1,
		ConstBatchInputReference batchX2,
		RealMatrix const& coefficients,
		State const& state,
		RealVector& gradient
	) const{
		ensure_size(gradient,numberOfParameters());

		std::size_t numKernels = m_base.size(); //how far the loop goes;

		InternalState const& s = state.toState<InternalState>();

		double sumSquared = sqr(m_weightsum); //denominator

		//first the derivative with respect to the (log-encoded) weight parameters;
		//the first weight is not a parameter and does not need a gradient.
		//(Theoretically, we would not even need to store its result.)
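		//Derivation note (for the loop below): with k(x,y) = sum_j w_j k_j(x,y) / W,
		//W = sum_j w_j and w_i = exp(xi_i), the quotient and chain rules give
		//  d k(x,y) / d xi_i = w_i * (k_i(x,y) * W - sum_j w_j k_j(x,y)) / W^2;
		//the gradient entries are these derivatives summed over the batch with the
		//given coefficients.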
		//calculate the weighted sum over all results
		double numeratorSum = sum(coefficients * s.result);
		for (std::size_t i = 1; i != numKernels && m_adaptWeights; i++) {
			//calculate the weighted sum over all results of this kernel
			double summedK=sum(coefficients * s.kernelResults[i]);
			gradient(i-1) = m_base[i].weight * (summedK * m_weightsum - numeratorSum) / sumSquared;
		}

		std::size_t gradPos = m_adaptWeights ? numKernels-1: 0; //starting position of the sub-kernel gradient
		RealVector kernelGrad;
		for (std::size_t i=0; i != numKernels; i++) {
			if (isAdaptive(i)){
				double coeff = m_base[i].weight / m_weightsum;
				m_base[i].kernel->weightedParameterDerivative(batchX1,batchX2,coefficients,*s.kernelStates[i],kernelGrad);
				std::size_t n = kernelGrad.size();
				noalias(subrange(gradient,gradPos,gradPos+n)) = coeff * kernelGrad;
				gradPos += n;
			}
		}
	}

	/// Input derivative, calculated according to the equation:
	/// <br/>
	/// \f$ \frac{\partial k(x, y)}{\partial x} =
	///     \frac{\sum_i \exp(w_i) \frac{\partial k_i(x, y)}{\partial x}}
	///          {\sum_i \exp(w_i)} \f$
	/// and summed up over all elements of the second batch
	void weightedInputDerivative(
		ConstBatchInputReference batchX1,
		ConstBatchInputReference batchX2,
		RealMatrix const& coefficientsX2,
		State const& state,
		BatchInputType& gradient
	) const{
		SIZE_CHECK(coefficientsX2.size1() == batchSize(batchX1));
		SIZE_CHECK(coefficientsX2.size2() == batchSize(batchX2));
		weightedInputDerivativeImpl<BatchInputType>(batchX1,batchX2,coefficientsX2,state,gradient);
	}

	void read(InArchive& ar){
		for(std::size_t i = 0;i != m_base.size(); ++i ){
			ar >> m_base[i].weight;
			ar >> m_base[i].adaptive;
			ar >> *(m_base[i].kernel);
		}
		ar >> m_weightsum;
		ar >> m_numParameters;
	}
	void write(OutArchive& ar) const{
		for(std::size_t i=0;i!= m_base.size();++i){
			ar << m_base[i].weight;
			ar << m_base[i].adaptive;
			ar << const_cast<AbstractKernelFunction<InputType> const&>(*(m_base[i].kernel));
		}
		ar << m_weightsum;
		ar << m_numParameters;
	}

protected:
	/// structure describing a single base kernel
	struct tBase
	{
		AbstractKernelFunction<InputType>* kernel; ///< pointer to the base kernel object
		double weight; ///< weight in the linear combination
		bool adaptive; ///< whether the parameters of this kernel are part of the WeightedSumKernel's parameter vector
	};

	void updateNumberOfParameters(){
		m_numParameters = m_adaptWeights? m_base.size()-1 : 0;
		for (std::size_t i=0; i != m_base.size(); i++)
			if (m_base[i].adaptive)
				m_numParameters += m_base[i].kernel->numberOfParameters();
	}

	//We need to choose the correct implementation at compile time to ensure that, in case
	//InputType does not implement the needed operations, the implementation is replaced by
	//a safe default which throws an exception. For this, we use enable_if/disable_if.
	//The method is called with BatchInputType as template argument. Real implementation first.
	template <class T>
	void weightedInputDerivativeImpl(
		ConstBatchInputReference batchX1,
		ConstBatchInputReference batchX2,
		RealMatrix const& coefficientsX2,
		State const& state,
		BatchInputType& gradient,
		typename boost::enable_if<boost::is_same<T,RealMatrix > >::type* dummy = 0
	)const{
		std::size_t numKernels = m_base.size(); //how far the loop goes;
		InternalState const& s = state.toState<InternalState>();

		//initialize gradient with the first kernel
		m_base[0].kernel->weightedInputDerivative(batchX1, batchX2, coefficientsX2, *s.kernelStates[0], gradient);
		gradient *= m_base[0].weight / m_weightsum;
		BatchInputType kernelGrad;
		for (std::size_t i=1; i != numKernels; i++){
			m_base[i].kernel->weightedInputDerivative(batchX1, batchX2, coefficientsX2, *s.kernelStates[i], kernelGrad);
			double coeff = m_base[i].weight / m_weightsum;
			gradient += coeff * kernelGrad;
		}
	}
	template <class T>
	void weightedInputDerivativeImpl(
		ConstBatchInputReference batchX1,
		ConstBatchInputReference batchX2,
		RealMatrix const& coefficientsX2,
		State const& state,
		BatchInputType& gradient,
		typename boost::disable_if<boost::is_same<T,RealMatrix > >::type* dummy = 0
	)const{
		throw SHARKEXCEPTION("[WeightedSumKernel::weightedInputDerivative] The used BatchInputType is not a vector type");
	}

	std::vector<tBase> m_base; ///< collection of base kernels
	double m_weightsum; ///< sum of all weights
	std::size_t m_numParameters; ///< total number of parameters
	bool m_adaptWeights; ///< whether the weights should be adapted
};


}
#endif