LMCMA.h
Go to the documentation of this file.
1 /*!
2  * \brief -
3  *
4  * \author Thomas Voss and Christian Igel
5  * \date April 2014
6  *
7  * \par Copyright 1995-2017 Shark Development Team
8  *
9  * <BR><HR>
10  * This file is part of Shark.
11  * <http://shark-ml.org/>
12  *
13  * Shark is free software: you can redistribute it and/or modify
14  * it under the terms of the GNU Lesser General Public License as published
15  * by the Free Software Foundation, either version 3 of the License, or
16  * (at your option) any later version.
17  *
18  * Shark is distributed in the hope that it will be useful,
19  * but WITHOUT ANY WARRANTY; without even the implied warranty of
20  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21  * GNU Lesser General Public License for more details.
22  *
23  * You should have received a copy of the GNU Lesser General Public License
24  * along with Shark. If not, see <http://www.gnu.org/licenses/>.
25  *
26  */
27 #ifndef SHARK_ALGORITHMS_DIRECT_SEARCH_LM_CMA_H
28 #define SHARK_ALGORITHMS_DIRECT_SEARCH_LM_CMA_H
29 
33 
37 
38 
39 
40 namespace shark {
41 
42 namespace detail{
43 
44 ///\brief Approximates a Limited Memory Cholesky Matrix from a stream of samples.
45 ///
46 /// Given a set of points \f$ v_i\f$, produces an approximation of the cholesky factor of a matrix:
47 /// \f[ AA^T=C= (1-\alpha) C^{t-1} + \alpha* x_{j_t} x_{j_t}^T \f]
48 /// here the \f$ j_t \f$ are chosen such to have an approximate distance \f$ N_{steps} \f$. It is assumed
49 /// that the \f$x_i \f$ are correlated and thus a big \f$ N_{steps} \f$ tris to get points which are less
50 /// correlated. The matrix keeps a set of vectors and decides at every step which is will discard.
51 ///
52 /// This is the corrected algorithm as proposed in
53 /// Ilya Loshchilov, "A Computationally Efficient Limited Memory CMA-ES for Large Scale Optimization"
54 ///
55 class IncrementalCholeskyMatrix{
56 public:
57  IncrementalCholeskyMatrix(){}
58  void init (double alpha,std::size_t dimensions, std::size_t numVectors, std::size_t Nsteps){
59  m_vArr.resize(numVectors,dimensions);
60  m_pcArr.resize(numVectors,dimensions);
61  m_b.resize(numVectors);
62  m_d.resize(numVectors);
63  m_l.resize(numVectors);
64  m_j.resize(0);//nothing stored at the bginning
65  m_Nsteps = Nsteps;
66  m_maxStoredVectors = numVectors;
67  m_counter = 0;
68  m_alpha = alpha;
69 
70  m_vArr.clear();
71  m_pcArr.clear();
72  m_b.clear();
73  m_d.clear();
74  m_l.clear();
75  }
76 
77  //computes x = Az
78  template<class T>
79  void prod(RealVector& x, T const& z)const{
80  x = z;
81  double a = std::sqrt(1-m_alpha);
82  for(std::size_t j=0; j != m_j.size(); j++){
83  std::size_t jcur = m_j[j];
84  double k = m_b(jcur) *inner_prod(row(m_vArr,jcur),z);
85  noalias(x) = a*x+k*row(m_pcArr,jcur);
86  }
87  }
88 
89  //computes x= A^{-1}z
90  template<class T>
91  void inv(RealVector& x, T const& z)const{
92  inv(x,z,m_j.size());
93  }
94 
95  void update(RealVector const& newPc){
96  std::size_t imin = 0;//the index of the removed point
97  if (m_j.size() < m_maxStoredVectors)
98  {
99  std::size_t index = m_j.size();
100  m_j.push_back(index);
101  imin = index;
102  }
103  else
104  {
105  //find the largest "age"gap between neighbouring points (i.e. the time between insertion)
106  //we want to remove the smallest gap as to make the
107  //time distances as equal as possible
108  std::size_t dmin = m_l[m_j[1]] - m_l[m_j[0]];
109  imin = 1;
110  for(std::size_t j=2; j != m_j.size(); j++)
111  {
112  std::size_t dcur = m_l[m_j[j]] - m_l[m_j[j-1]];
113  if (dcur < dmin)
114  {
115  dmin = dcur;
116  imin = j;
117  }
118  }
119  //if the gap is bigger than Nsteps, we remove the oldest point to
120  //shrink it.
121  if (dmin >= m_Nsteps)
122  imin = 0;
123  //we push all points backwards and append the freed index to the end of the list
124  if (imin != m_j.size()-1)
125  {
126  std::size_t sav = m_j[imin];
127  for(std::size_t j = imin; j != m_j.size()-1; j++)
128  m_j[j] = m_j[j+1];
129  m_j.back() = sav;
130  }
131  }
132  //set the values of the new added index
133  int newidx = m_j.back();
134  m_l[newidx] = m_counter;
135  noalias(row(m_pcArr,newidx)) = newPc;
136  ++m_counter;
137 
138  // this procedure recomputes v vectors correctly, in the original LM-CMA-ES they were outdated/corrupted.
139  // all vectors v_k,v_{k+1},...,v_m are corrupted where k=j_imin. it also computes the proper v and b/d values for the newest
140  // inserted vector
141  RealVector v;
142  for(std::size_t i = imin; i != m_j.size(); ++i)
143  {
144  int index = m_j[i];
145  inv(v,row(m_pcArr,index),i);
146  noalias(row(m_vArr,index)) = v;
147 
148  double normv2 = norm_sqr(row(m_vArr,index));
149  double c = std::sqrt(1.0-m_alpha);
150  double f = std::sqrt(1+m_alpha/(1-m_alpha)*normv2);
151  m_b[index] = c/normv2*(f-1);
152  m_d[index] = 1/(c*normv2)*(1-1/f);
153  }
154  }
155 
156 private:
157  template<class T>
158  void inv(RealVector& x, T const& z,std::size_t k)const{
159  x = z;
160  double c= 1.0/std::sqrt(1-m_alpha);
161  for(std::size_t j=0; j != k; j++){// O(m*n)
162  std::size_t jcur = m_j[j];
163  double k = m_d(jcur) * inner_prod(row(m_vArr,jcur),x);
164  noalias(x) = c*x - k*row(m_vArr,jcur);
165  }
166  }
167 
168  //variables making up A
169  RealMatrix m_vArr;
170  RealMatrix m_pcArr;
171  RealVector m_b;
172  RealVector m_d;
173 
174  //index variables for computation of A
175  std::vector<std::size_t> m_j;
176  std::vector<std::size_t> m_l;
177  std::size_t m_Nsteps;
178  std::size_t m_maxStoredVectors;
179  std::size_t m_counter;
180 
181  double m_alpha;
182 };
183 }
184 
185 /// \brief Implements a Limited-Memory-CMA
186 ///
187 /// This is the algorithm as proposed in
188 /// Ilya Loshchilov, "A Computationally Efficient Limited Memory CMA-ES for Large Scale Optimization"
189 /// with a few corrections regarding the covariance matrix update.
190 ///
191 /// The algorithm stores a subset of previous evolution path vectors and approximates the covariance
192 /// matrix based on this. This algorithm only requires O(nm) memory, where n is the dimensionality
193 /// and n the problem dimensionality. To be more exact, 2*m vectors of size n are stored to calculate
194 /// the matrix-vector product with the choelsky factor of the covariance matrix in O(mn).
195 ///
196 /// The algorithm uses the population based step size adaptation strategy as proposed in
197 /// the same paper.
198 class LMCMA: public AbstractSingleObjectiveOptimizer<RealVector >
199 {
200 public:
201  /// \brief Default c'tor.
202  LMCMA(random::rng_type& rng = random::globalRng):mpe_rng(&rng){
203  m_features |= REQUIRES_VALUE;
204  }
205 
206 
207  /// \brief From INameable: return the class name.
208  std::string name() const
209  { return "LMCMA-ES"; }
210 
211  /// \brief Calculates lambda for the supplied dimensionality n.
212  unsigned suggestLambda( unsigned int dimension ) {
213  unsigned lambda = unsigned( 4. + ::floor( 3. * ::log( static_cast<double>( dimension ) ) ) );
214  // heuristic for small search spaces
215  lambda = std::max<unsigned int>( 5, std::min( lambda, dimension ) );
216  return( lambda );
217  }
218 
219  /// \brief Calculates mu for the supplied lambda and the recombination strategy.
220  double suggestMu( unsigned int lambda) {
221  return lambda / 2.;
222  }
223 
225 
226  /// \brief Initializes the algorithm for the supplied objective function.
227  void init( ObjectiveFunctionType const& function, SearchPointType const& p) {
228  unsigned int lambda = suggestLambda( p.size() );
229  unsigned int mu = suggestMu( lambda );
230  init( function,
231  p,
232  lambda,
233  mu,
234  1.0/std::sqrt(double(p.size()))
235  );
236  }
237 
238  /// \brief Initializes the algorithm for the supplied objective function.
239  void init(
240  ObjectiveFunctionType const& function,
241  SearchPointType const& initialSearchPoint,
242  unsigned int lambda,
243  double mu,
244  double initialSigma
245  ) {
246  checkFeatures(function);
247 
248  m_numberOfVariables = function.numberOfVariables();
249  m_lambda = lambda;
250  m_mu = static_cast<unsigned int>(::floor(mu));
251 
252  //set initial point
253  m_mean = initialSearchPoint;
254  m_best.point = initialSearchPoint;
255  m_best.value = function(initialSearchPoint);
256 
257  //init step size adaptation
258  m_stepSize.init(initialSigma);
259 
260  //weighting of the mu-best individuals
261  m_weights.resize(m_mu);
262  for (unsigned int i = 0; i < m_mu; i++){
263  m_weights(i) = ::log(mu + 0.5) - ::log(1. + i);
264  }
265  m_weights /= sum(m_weights);
266 
267  // learning rates
268  m_muEff = 1. / sum(sqr(m_weights)); // equal to sum(m_weights)^2 / sum(sqr(m_weights))
269  double c1 = 1/(10*std::log(m_numberOfVariables+1.0));
270  m_cC =1.0/m_lambda;
271 
272  //init variables for covariance matrix update
273  m_evolutionPathC = blas::repeat(0.0,m_numberOfVariables);
274  m_A.init(c1,m_numberOfVariables,lambda,lambda);
275  }
276 
277  /// \brief Executes one iteration of the algorithm.
278  void step(ObjectiveFunctionType const& function){
280  std::vector< IndividualType > offspring( m_lambda );
281 
282  PenalizingEvaluator penalizingEvaluator;
283  for( unsigned int i = 0; i < offspring.size(); i++ ) {
284  createSample(offspring[i].searchPoint(),offspring[i].chromosome());
285  }
286  penalizingEvaluator( function, offspring.begin(), offspring.end() );
287 
288  // Selection and parameter update
289  // opposed to normal CMA selection, we don't remove any indidivudals but only order
290  // them by rank to allow the use of the population based strategy.
291  std::vector< IndividualType > parents( lambda() );
293  selection(offspring.begin(),offspring.end(),parents.begin(), parents.end());
294  updateStrategyParameters( parents );
295 
296  //update the best solution found so far.
297  m_best.point= parents[ 0 ].searchPoint();
298  m_best.value= parents[ 0 ].unpenalizedFitness();
299  }
300 
301  /// \brief Accesses the current step size.
302  double sigma() const {
303  return m_stepSize.stepSize();
304  }
305 
306  /// \brief Accesses the current population mean.
307  RealVector const& mean() const {
308  return m_mean;
309  }
310 
311  /// \brief Accesses the current weighting vector.
312  RealVector const& weights() const {
313  return m_weights;
314  }
315 
316  /// \brief Accesses the evolution path for the covariance matrix update.
317  RealVector const& evolutionPath() const {
318  return m_evolutionPathC;
319  }
320 
321  /// \brief Returns the size of the parent population \f$\mu\f$.
322  unsigned int mu() const {
323  return m_mu;
324  }
325 
326  /// \brief Returns a mutabl rference to the size of the parent population \f$\mu\f$.
327  unsigned int& mu(){
328  return m_mu;
329  }
330 
331  /// \brief Returns a immutable reference to the size of the offspring population \f$\mu\f$.
332  unsigned int lambda()const{
333  return m_lambda;
334  }
335 
336  /// \brief Returns a mutable reference to the size of the offspring population \f$\mu\f$.
337  unsigned int & lambda(){
338  return m_lambda;
339  }
340 
341 private:
342  /// \brief Updates the strategy parameters based on the supplied offspring population.
343  void updateStrategyParameters( std::vector<Individual<RealVector, double, RealVector> > const& offspring ) {
344  //line 8, creation of the new mean (but not updating the mean of the distribution
345  RealVector m( m_numberOfVariables, 0. );
346  for( unsigned int j = 0; j < mu(); j++ ){
347  noalias(m) += m_weights( j ) * offspring[j].searchPoint();
348  }
349 
350  //update evolution path, line 9
351  noalias(m_evolutionPathC) = (1. - m_cC ) * m_evolutionPathC + std::sqrt( m_cC * (2. - m_cC) * m_muEff ) * (m - m_mean) / sigma();
352 
353  //update mean now, as oldmean is not needed any more (line 8 continued)
354  m_mean = m;
355 
356  //corrected version of lines 10-14- the covariance matrix adaptation
357  //we replace one vector that makes up the approximation of A by the newly updated evolution path
358  m_A.update(m_evolutionPathC);
359 
360  //update the step size using the population success rule, line 15-18
361  m_stepSize.update(offspring);
362 
363  }
364 
365  /// \brief Creates a vector-sample pair x=Az, where z is a gaussian random vector.
366  void createSample(RealVector& x,RealVector& z)const{
367  x.resize(m_numberOfVariables);
368  z.resize(m_numberOfVariables);
369  for(std::size_t i = 0; i != m_numberOfVariables; ++i){
370  z(i) = gauss(*mpe_rng,0,1);
371  }
372  m_A.prod(x,z);
373  noalias(x) = sigma()*x +m_mean;
374  }
375 
376  unsigned int m_numberOfVariables; ///< Stores the dimensionality of the search space.
377  unsigned int m_mu; ///< The size of the parent population.
378  unsigned int m_lambda; ///< The size of the offspring population, needs to be larger than mu.
379 
380  double m_cC;///< learning rate of the evolution path
381 
382 
383  detail::IncrementalCholeskyMatrix m_A;
384  PopulationBasedStepSizeAdaptation m_stepSize;///< step size adaptation for the step size sigma()
385 
386  RealVector m_mean; ///< current mean of the distribution
387  RealVector m_weights;///< weighting for the mu best individuals
388  double m_muEff;///< effective sample size for the weighted samples
389 
390  RealVector m_evolutionPathC;///< evolution path
391 
392  random::rng_type* mpe_rng;
393 
394 };
395 
396 }
397 
398 #endif