VariationalAutoencoderError.h
/*!
 *
 * \brief Variational-autoencoder error function
 *
 * \author O.Krause
 * \date 2017
 *
 * \par Copyright 1995-2017 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <http://shark-ml.org/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
#ifndef SHARK_OBJECTIVEFUNCTIONS_VARIATIONAL_AUTOENCODER_ERROR_H
#define SHARK_OBJECTIVEFUNCTIONS_VARIATIONAL_AUTOENCODER_ERROR_H

#include <shark/ObjectiveFunctions/AbstractObjectiveFunction.h>
#include <shark/ObjectiveFunctions/Loss/AbstractLoss.h>
#include <shark/Core/Random.h>

namespace shark{

/// \brief Computes the variational autoencoder error function
///
/// We want to optimize a model \f$ p(x) = \int p(x|z) p(z) dz \f$ where p(z) is chosen as a multivariate normal
/// distribution and p(x|z) is an arbitrary model, e.g. a deep neural network. The naive solution is to sample from
/// p(z) and compute the sample average of p(x|z). This fails when p(z|x) is a very localized distribution, because we
/// might need a huge number of samples from p(z) before we find one that is likely under p(z|x). Since p(z|x) is
/// assumed to be intractable to compute, we introduce a second model q(z|x) and train it to approximate the unknown
/// p(z|x). For this, a variational lower bound on the likelihood is used and we maximize
/// \f[ \log p(x) \geq E_{q(z|x)}[\log p(x|z)] - KL[q(z|x) || p(z)] \f]
/// The first term explains the name variational autoencoder: we first sample z for a given x using the encoder model
/// q and then decode z to obtain an estimate of x. The only difference from a normal autoencoder is that z is now
/// probabilistic. The second term ensures that q learns p(z|x), assuming that q has enough modeling capacity to
/// represent it. See https://arxiv.org/abs/1606.05908 for more background.
///
/// Implementation notice: we assume q(z|x) to be a set of independent Gaussian distributions parameterized as
/// \f$ q(z| \mu(x), \log \sigma^2(x)) \f$.
/// The provided encoder model q must therefore have twice as many outputs as the decoder has inputs, because the
/// second half of the outputs is interpreted as the log of the variance. So if z is a 100-dimensional variable, q must
/// have 200 outputs. The output format and loss function used for the decoder p are arbitrary; a SquaredLoss works
/// well, but other losses, e.g. pixel-wise probabilities, can also be used.
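///
/// For this diagonal-Gaussian choice of q, the KL-term has the closed form
/// \f[ KL[q(z|x) || p(z)] = \frac{1}{2} \sum_i \left( e^{\log \sigma_i^2} + \mu_i^2 - 1 - \log \sigma_i^2 \right) \f]
/// and sampling from q uses the reparameterization trick \f$ z = \mu + e^{\frac{1}{2} \log \sigma^2} \odot \epsilon \f$
/// with \f$ \epsilon \sim N(0,I) \f$; this is exactly what eval() and evalDerivative() below compute on a randomly
/// drawn batch of data.
///
/// A minimal usage sketch (layer types, sizes and initialization are purely illustrative and not prescribed by this class):
/// \code
/// UnlabeledData<RealVector> data = ...; // unsupervised training data
/// std::size_t latent = 10;
/// // the encoder outputs 2*latent values: mu followed by log sigma^2
/// LinearModel<RealVector, RectifierNeuron> encoderHidden(dataDimension(data), 100, true);
/// LinearModel<RealVector> encoderOut(100, 2 * latent, true);
/// auto encoder = encoderHidden >> encoderOut;
/// // the decoder maps latent codes back to the input space
/// LinearModel<RealVector, RectifierNeuron> decoderHidden(latent, 100, true);
/// LinearModel<RealVector> decoderOut(100, dataDimension(data), true);
/// auto decoder = decoderHidden >> decoderOut;
/// SquaredLoss<> loss;
/// VariationalAutoencoderError error(data, &encoder, &decoder, &loss);
/// // error can now be minimized by any gradient-based optimizer, starting from error.proposeStartingPoint()
/// \endcode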
class VariationalAutoencoderError : public SingleObjectiveFunction
{
public:
	typedef UnlabeledData<RealVector> DatasetType;
	VariationalAutoencoderError(
		DatasetType const& data,
		AbstractModel<RealVector, RealVector>* encoder,
		AbstractModel<RealVector, RealVector>* decoder,
		AbstractLoss<RealVector, RealVector>* visible_loss
	):mep_decoder(decoder), mep_encoder(encoder), mep_loss(visible_loss), m_data(data){
		if(mep_decoder->hasFirstParameterDerivative() && mep_encoder->hasFirstParameterDerivative())
			m_features |= HAS_FIRST_DERIVATIVE;
		m_features |= CAN_PROPOSE_STARTING_POINT;
	}

	/// \brief From INameable: return the class name.
	std::string name() const
	{ return "VariationalAutoencoderError"; }

	SearchPointType proposeStartingPoint() const{
		return mep_decoder->parameterVector() | mep_encoder->parameterVector();
	}

	std::size_t numberOfVariables()const{
		return mep_decoder->numberOfParameters() + mep_encoder->numberOfParameters();
	}

	ResultType eval(RealVector const& parameters) const{
		SIZE_CHECK(parameters.size() == numberOfVariables());
		m_evaluationCounter++;
		mep_decoder->setParameterVector(subrange(parameters, 0, mep_decoder->numberOfParameters()));
		mep_encoder->setParameterVector(subrange(parameters, mep_decoder->numberOfParameters(), numberOfVariables()));
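		//note: the parameter layout is (decoder parameters | encoder parameters),
		//the same concatenation as returned by proposeStartingPoint()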

		//draw a random batch and compute the encoder response; the first half of the outputs is mu,
		//the second half is log sigma^2
		auto const& batch = m_data.batch(random::discrete(*mep_rng, std::size_t(0), m_data.numberOfBatches() - 1));
		RealMatrix hiddenResponse = (*mep_encoder)(batch);
		auto const& mu = columns(hiddenResponse, 0, hiddenResponse.size2()/2);
		auto const& log_var = columns(hiddenResponse, hiddenResponse.size2()/2, hiddenResponse.size2());
		//compute the Kullback-Leibler divergence term in closed form
		double klError = 0.5 * (sum(exp(log_var)) + sum(sqr(mu)) - mu.size1() * mu.size2() - sum(log_var));
		//sample random points from the standard normal distribution
		RealMatrix epsilon(mu.size1(), mu.size2());
		for(std::size_t i = 0; i != epsilon.size1(); ++i){
			for(std::size_t j = 0; j != epsilon.size2(); ++j){
				epsilon(i,j) = random::gauss(*mep_rng, 0, 1);
			}
		}
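		//reparameterization trick: z = mu + sigma * epsilon with sigma = exp(0.5 * log_var),
		//computed element-wise, so the sampling noise stays outside the differentiable path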
		RealMatrix z = mu + exp(0.5 * log_var) * epsilon;
		//reconstruct and compute the reconstruction error
		RealMatrix reconstruction = (*mep_decoder)(z);
		return ((*mep_loss)(batch, reconstruction) + klError) / batch.size1();
	}
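
	/// \brief Computes the same noisy error estimate as eval() on a randomly drawn batch and, in addition,
	/// its derivative w.r.t. the concatenated parameter vector of decoder and encoder.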
	ResultType evalDerivative(
		SearchPointType const& parameters,
		FirstOrderDerivative& derivative
	) const{
		SIZE_CHECK(parameters.size() == numberOfVariables());
		m_evaluationCounter++;
		mep_decoder->setParameterVector(subrange(parameters, 0, mep_decoder->numberOfParameters()));
		mep_encoder->setParameterVector(subrange(parameters, mep_decoder->numberOfParameters(), numberOfVariables()));

		boost::shared_ptr<State> stateEncoder = mep_encoder->createState();
		boost::shared_ptr<State> stateDecoder = mep_decoder->createState();
		auto const& batch = m_data.batch(random::discrete(*mep_rng, std::size_t(0), m_data.numberOfBatches() - 1));
		RealMatrix hiddenResponse;
		mep_encoder->eval(batch, hiddenResponse, *stateEncoder);
		auto const& mu = columns(hiddenResponse, 0, hiddenResponse.size2()/2);
		auto const& log_var = columns(hiddenResponse, hiddenResponse.size2()/2, hiddenResponse.size2());
		//compute the Kullback-Leibler divergence term in closed form
		double klError = 0.5 * (sum(exp(log_var)) + sum(sqr(mu)) - mu.size1() * mu.size2() - sum(log_var));
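		//its gradient w.r.t. the encoder outputs is d KL/d mu = mu and d KL/d log_var = 0.5 * (exp(log_var) - 1),
		//concatenated column-wise to match the (mu | log_var) layout of the encoder response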
		RealMatrix klDerivative = mu | (0.5 * exp(log_var) - 0.5);
		RealMatrix epsilon(mu.size1(), mu.size2());
		for(std::size_t i = 0; i != epsilon.size1(); ++i){
			for(std::size_t j = 0; j != epsilon.size2(); ++j){
				epsilon(i,j) = random::gauss(*mep_rng, 0, 1);
			}
		}
		RealMatrix z = mu + exp(0.5 * log_var) * epsilon;
		RealMatrix reconstructions;
		mep_decoder->eval(z, reconstructions, *stateDecoder);

		//compute the derivative of the reconstruction loss
		RealMatrix lossDerivative;
		double recError = mep_loss->evalDerivative(batch, reconstructions, lossDerivative);
		//backpropagate the error from the reconstruction loss through the decoder
		RealVector derivativeDecoder;
		RealMatrix backpropDecoder;
		mep_decoder->weightedDerivatives(z, reconstructions, lossDerivative, *stateDecoder, derivativeDecoder, backpropDecoder);

		//compute coefficients of the backprop from mep_decoder and the KL-term
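		//chain rule through z = mu + exp(0.5 * log_var) * epsilon: dz/dmu = 1 and
		//dz/dlog_var = 0.5 * exp(0.5 * log_var) * epsilon = 0.5 * (z - mu), so the decoder backprop enters the
		//mu-columns unchanged and the log_var-columns scaled by 0.5 * (z - mu)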
		RealMatrix backprop = (backpropDecoder | (backpropDecoder * 0.5 * (z - mu))) + klDerivative;
		RealVector derivativeEncoder;
		mep_encoder->weightedParameterDerivative(batch, hiddenResponse, backprop, *stateEncoder, derivativeEncoder);

		derivative.resize(numberOfVariables());
		noalias(derivative) = derivativeDecoder | derivativeEncoder;
		derivative /= batch.size1();
		return (recError + klError) / batch.size1();
	}

private:
	AbstractModel<RealVector, RealVector>* mep_decoder;
	AbstractModel<RealVector, RealVector>* mep_encoder;
	AbstractLoss<RealVector, RealVector>* mep_loss;
	DatasetType m_data;
	random::rng_type* mep_rng = &random::globalRng;
};

}
#endif