Rprop.h
/*!
 *
 * \brief Implements different variants of the Resilient Backpropagation (Rprop) algorithm.
 *
 * \author Oswin Krause
 * \date 2010
 *
 * \par Copyright 1995-2017 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <http://shark-ml.org/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */

#ifndef SHARK_ML_OPTIMIZER_RPROP_H
#define SHARK_ML_OPTIMIZER_RPROP_H

#include <shark/Core/DLLSupport.h>
#include <shark/Algorithms/AbstractSingleObjectiveOptimizer.h>

namespace shark{

/*!
 * \brief This class implements the Resilient Backpropagation algorithm
 * (Rprop) without weight-backtracking.
 *
 * The Rprop algorithm is an improvement over algorithms with adaptive
 * learning rates (such as the Adaptive Backpropagation algorithm by Silva
 * and Almeida; please see AdpBP.h for a description of how such an
 * algorithm works) in that its weight-update increments are independent
 * of the absolute values of the partial derivatives. This makes sense
 * because large flat regions of the search space (plateaus) lead to small
 * absolute partial derivatives, so the increments would be chosen small
 * even though large increments are needed to cross the plateau. In
 * contrast, the absolute partial derivatives are very large at the
 * "slopes" of very "narrow canyons", which leads to large increments
 * that skip the minimum at the bottom of the canyon, even though small
 * increments would be needed to hit it. <br>
 * Therefore, the Rprop algorithm uses only the signs of the partial
 * derivatives, not their absolute values, to adapt the parameters. <br>
 * Instead of individual learning rates, it uses the parameter
 * \f$\Delta_i^{(t)}\f$ for weight \f$w_i,\ i = 1, \dots, n\f$ in
 * iteration \f$t\f$, which is adapted before the weights are
 * changed: <br>
 *
 * \f$
 * \Delta_i^{(t)} = \Bigg\{
 * \begin{array}{ll}
 * \min( \eta^+ \cdot \Delta_i^{(t-1)}, \Delta_{max} ), & \mbox{if\ }
 * \frac{\partial E^{(t-1)}}{\partial w_i} \cdot
 * \frac{\partial E^{(t)}}{\partial w_i} > 0 \\
 * \max( \eta^- \cdot \Delta_i^{(t-1)}, \Delta_{min} ), & \mbox{if\ }
 * \frac{\partial E^{(t-1)}}{\partial w_i} \cdot
 * \frac{\partial E^{(t)}}{\partial w_i} < 0 \\
 * \Delta_i^{(t-1)}, & \mbox{otherwise}
 * \end{array}
 * \f$
 *
 * The parameters \f$\eta^+ > 1\f$ and \f$0 < \eta^- < 1\f$ control
 * the speed of the adaptation. To stabilize the increments, they are
 * restricted to the interval \f$[\Delta_{min}, \Delta_{max}]\f$. <br>
 * After the adaptation of the \f$\Delta_i\f$, the weight update is
 * calculated as
 *
 * \f$
 * \Delta w_i^{(t)} := - \mbox{sign}
 * \left( \frac{\partial E^{(t)}}{\partial w_i}\right) \cdot \Delta_i^{(t)}
 * \f$
 *
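 * A minimal, self-contained sketch of this update rule (for illustration
 * only; this is not the Shark implementation and all names in it are
 * hypothetical):
 *
 * \code
 * #include <algorithm>
 * #include <vector>
 *
 * // One Rprop- step on plain vectors: w holds the weights, g the current
 * // gradient, gOld the previous gradient, delta the per-weight step sizes.
 * void rpropMinusStep(
 *     std::vector<double>& w, std::vector<double> const& g,
 *     std::vector<double>& gOld, std::vector<double>& delta,
 *     double etaPlus = 1.2, double etaMinus = 0.5,
 *     double maxDelta = 1e100, double minDelta = 0.0
 * ){
 *     for(std::size_t i = 0; i != w.size(); ++i){
 *         double prod = g[i] * gOld[i];
 *         if(prod > 0)       // same sign as in the last iteration: accelerate
 *             delta[i] = std::min(etaPlus * delta[i], maxDelta);
 *         else if(prod < 0)  // sign change: a minimum was skipped, slow down
 *             delta[i] = std::max(etaMinus * delta[i], minDelta);
 *         double sign = (g[i] > 0.0) - (g[i] < 0.0);
 *         w[i] -= sign * delta[i];  // step against the sign of the gradient
 *         gOld[i] = g[i];
 *     }
 * }
 * \endcode
 *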
 * For further information about the algorithm, please refer to: <br>
 *
 * Martin Riedmiller, <br>
 * "Advanced Supervised Learning in Multi-layer Perceptrons -
 * From Backpropagation to Adaptive Learning Algorithms". <br>
 * In "International Journal of Computer Standards and Interfaces", volume 16,
 * no. 5, 1994, pp. 265-278 <br>
 *
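 * A hedged usage sketch: \c f stands for any differentiable single-objective
 * function and \c start for a suitable starting point; both are assumptions
 * and not part of this header.
 *
 * \code
 * shark::RpropMinus rprop;
 * rprop.setEtaPlus(1.2);   // increase factor
 * rprop.setEtaMinus(0.5);  // decrease factor
 * rprop.init(f, start);    // declared below
 * for(std::size_t t = 0; t != 1000; ++t){
 *     rprop.step(f);       // one Rprop iteration
 *     // rprop.derivative() could be used here for a gradient-based stopping criterion
 * }
 * // best point and value found so far (provided by the optimizer base class):
 * // rprop.solution().point, rprop.solution().value
 * \endcode
 *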
 * \author C. Igel
 * \date 1999
 *
 * \par Changes:
 * none
 *
 * \par Status:
 * stable
 *
 */
class RpropMinus : public AbstractSingleObjectiveOptimizer<RealVector >
{
public:

    /// \brief From INameable: return the class name.
    std::string name() const
    { return "RpropMinus"; }

    SHARK_EXPORT_SYMBOL void init(ObjectiveFunctionType const& objectiveFunction, SearchPointType const& startingPoint);
    SHARK_EXPORT_SYMBOL virtual void init(ObjectiveFunctionType const& objectiveFunction, SearchPointType const& startingPoint, double initDelta);

    SHARK_EXPORT_SYMBOL void step(ObjectiveFunctionType const& objectiveFunction);

    SHARK_EXPORT_SYMBOL virtual void read( InArchive & archive );
    SHARK_EXPORT_SYMBOL virtual void write( OutArchive & archive ) const;

    //! set decrease factor
    void setEtaMinus(double etaMinus) {
        RANGE_CHECK( etaMinus < 1 );
        RANGE_CHECK( etaMinus > 0 );
        m_decreaseFactor = etaMinus;
    }

    //! set increase factor
    void setEtaPlus(double etaPlus) {
        RANGE_CHECK( etaPlus > 1 );
        m_increaseFactor = etaPlus;
    }

    //! set upper limit on update
    void setMaxDelta(double d) {
        RANGE_CHECK( d > 0 );
        m_maxDelta = d;
    }

    //! set lower limit on update
    void setMinDelta(double d) {
        RANGE_CHECK( d >= 0 );
        m_minDelta = d;
    }

    //! return the maximal step size component
    double maxDelta() const {
        return *std::max_element(m_delta.begin(),m_delta.end());
    }

    /// \brief Returns the derivative at the current point. Can be used for stopping criteria.
    RealVector const& derivative()const{
        return m_derivative;
    }
protected:

    //! The increase factor \f$ \eta^+ \f$, set to 1.2 by default.
    double m_increaseFactor;

    //! The decrease factor \f$ \eta^- \f$, set to 0.5 by default.
    double m_decreaseFactor;

    //! The upper limit of the increments \f$ \Delta w_i^{(t)} \f$, set to 1e100 by default.
    double m_maxDelta;

    //! The lower limit of the increments \f$ \Delta w_i^{(t)} \f$, set to 0.0 by default.
    double m_minDelta;

    //! The current error gradient.
    RealVector m_derivative;

    //! The last error gradient.
    RealVector m_oldDerivative;

    //! The absolute update values (increment) for all weights.
    RealVector m_delta;
};

//===========================================================================
/*!
 * \brief This class implements the Resilient Backpropagation algorithm
 * (Rprop) with weight-backtracking.
 *
 * The Rprop algorithm is an improvement over algorithms with adaptive
 * learning rates (such as the Adaptive Backpropagation algorithm by Silva
 * and Almeida; please see AdpBP.h for a description of how such an
 * algorithm works) in that its weight-update increments are independent
 * of the absolute values of the partial derivatives. This makes sense
 * because large flat regions of the search space (plateaus) lead to small
 * absolute partial derivatives, so the increments would be chosen small
 * even though large increments are needed to cross the plateau. In
 * contrast, the absolute partial derivatives are very large at the
 * "slopes" of very "narrow canyons", which leads to large increments
 * that skip the minimum at the bottom of the canyon, even though small
 * increments would be needed to hit it. <br>
 * Therefore, the Rprop algorithm uses only the signs of the partial
 * derivatives, not their absolute values, to adapt the parameters. <br>
 * Instead of individual learning rates, it uses the parameter
 * \f$\Delta_i^{(t)}\f$ for weight \f$w_i,\ i = 1, \dots, n\f$ in
 * iteration \f$t\f$, which is adapted before the weights are
 * changed: <br>
 *
 * \f$
 * \Delta_i^{(t)} = \Bigg\{
 * \begin{array}{ll}
 * \min( \eta^+ \cdot \Delta_i^{(t-1)}, \Delta_{max} ), & \mbox{if\ }
 * \frac{\partial E^{(t-1)}}{\partial w_i} \cdot
 * \frac{\partial E^{(t)}}{\partial w_i} > 0 \\
 * \max( \eta^- \cdot \Delta_i^{(t-1)}, \Delta_{min} ), & \mbox{if\ }
 * \frac{\partial E^{(t-1)}}{\partial w_i} \cdot
 * \frac{\partial E^{(t)}}{\partial w_i} < 0 \\
 * \Delta_i^{(t-1)}, & \mbox{otherwise}
 * \end{array}
 * \f$
 *
 * The parameters \f$\eta^+ > 1\f$ and \f$0 < \eta^- < 1\f$ control
 * the speed of the adaptation. To stabilize the increments, they are
 * restricted to the interval \f$[\Delta_{min}, \Delta_{max}]\f$. <br>
 * After the adaptation of the \f$\Delta_i\f$, the weight update is
 * calculated as
 *
 * \f$
 * \Delta w_i^{(t)} := - \mbox{sign}
 * \left( \frac{\partial E^{(t)}}{\partial w_i}\right) \cdot \Delta_i^{(t)}
 * \f$
 *
 * Furthermore, weight-backtracking takes place to increase the
 * stability of the method: if \f$\frac{\partial E^{(t-1)}}{\partial w_i} \cdot
 * \frac{\partial E^{(t)}}{\partial w_i} < 0\f$ then
 * \f$\Delta w_i^{(t)} := - \Delta w_i^{(t-1)};
 * \frac{\partial E^{(t)}}{\partial w_i} := 0\f$, where
 * assigning zero to the partial derivative of the error
 * freezes the increment in the next iteration.
 *
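 * A minimal sketch of this backtracking step, continuing the hypothetical
 * per-coordinate loop shown for RpropMinus (not the Shark implementation;
 * \c deltaw stores the update applied in the previous iteration):
 *
 * \code
 * if(g[i] * gOld[i] < 0){                       // sign change: minimum was skipped
 *     delta[i] = std::max(etaMinus * delta[i], minDelta);
 *     deltaw[i] = -deltaw[i];                   // revert the previous step of this weight
 *     w[i] += deltaw[i];
 *     gOld[i] = 0.0;                            // freeze the adaptation in the next iteration
 * }else{
 *     if(g[i] * gOld[i] > 0)
 *         delta[i] = std::min(etaPlus * delta[i], maxDelta);
 *     double sign = (g[i] > 0.0) - (g[i] < 0.0);
 *     deltaw[i] = -sign * delta[i];
 *     w[i] += deltaw[i];
 *     gOld[i] = g[i];
 * }
 * \endcode
 *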
 * For further information about the algorithm, please refer to: <br>
 *
 * Martin Riedmiller and Heinrich Braun, <br>
 * "A Direct Adaptive Method for Faster Backpropagation Learning: The
 * RPROP Algorithm". <br>
 * In "Proceedings of the IEEE International Conference on Neural Networks",
 * pp. 586-591, <br>
 * Published by IEEE Press in 1993
 *
 * \author C. Igel
 * \date 1999
 *
 * \par Changes:
 * none
 *
 * \par Status:
 * stable
 *
 */
class RpropPlus : public RpropMinus
{
public:

    /// \brief From INameable: return the class name.
    std::string name() const
    { return "RpropPlus"; }

    SHARK_EXPORT_SYMBOL void init(ObjectiveFunctionType const& objectiveFunction, SearchPointType const& startingPoint);
    SHARK_EXPORT_SYMBOL void init(ObjectiveFunctionType const& objectiveFunction, SearchPointType const& startingPoint, double initDelta);

    SHARK_EXPORT_SYMBOL void step(ObjectiveFunctionType const& objectiveFunction);
    SHARK_EXPORT_SYMBOL void read( InArchive & archive );
    SHARK_EXPORT_SYMBOL void write( OutArchive & archive ) const;

protected:
    //! The final update values for all weights.
    RealVector m_deltaw;
};


/*!
 * \brief This class implements the improved Resilient Backpropagation
 * algorithm (iRprop+) with weight-backtracking.
 *
 * The Rprop algorithm is an improvement over algorithms with adaptive
 * learning rates (such as the Adaptive Backpropagation algorithm by Silva
 * and Almeida; please see AdpBP.h for a description of how such an
 * algorithm works) in that its weight-update increments are independent
 * of the absolute values of the partial derivatives. This makes sense
 * because large flat regions of the search space (plateaus) lead to small
 * absolute partial derivatives, so the increments would be chosen small
 * even though large increments are needed to cross the plateau. In
 * contrast, the absolute partial derivatives are very large at the
 * "slopes" of very "narrow canyons", which leads to large increments
 * that skip the minimum at the bottom of the canyon, even though small
 * increments would be needed to hit it. <br>
 * Therefore, the Rprop algorithm uses only the signs of the partial
 * derivatives, not their absolute values, to adapt the parameters. <br>
 * Instead of individual learning rates, it uses the parameter
 * \f$\Delta_i^{(t)}\f$ for weight \f$w_i,\ i = 1, \dots, n\f$ in
 * iteration \f$t\f$, which is adapted before the weights are
 * changed: <br>
 *
 * \f$
 * \Delta_i^{(t)} = \Bigg\{
 * \begin{array}{ll}
 * \min( \eta^+ \cdot \Delta_i^{(t-1)}, \Delta_{max} ), & \mbox{if\ }
 * \frac{\partial E^{(t-1)}}{\partial w_i} \cdot
 * \frac{\partial E^{(t)}}{\partial w_i} > 0 \\
 * \max( \eta^- \cdot \Delta_i^{(t-1)}, \Delta_{min} ), & \mbox{if\ }
 * \frac{\partial E^{(t-1)}}{\partial w_i} \cdot
 * \frac{\partial E^{(t)}}{\partial w_i} < 0 \\
 * \Delta_i^{(t-1)}, & \mbox{otherwise}
 * \end{array}
 * \f$
 *
 * The parameters \f$\eta^+ > 1\f$ and \f$0 < \eta^- < 1\f$ control
 * the speed of the adaptation. To stabilize the increments, they are
 * restricted to the interval \f$[\Delta_{min}, \Delta_{max}]\f$. <br>
 * After the adaptation of the \f$\Delta_i\f$, the weight update is
 * calculated as
 *
 * \f$
 * \Delta w_i^{(t)} := - \mbox{sign}
 * \left( \frac{\partial E^{(t)}}{\partial w_i}\right) \cdot \Delta_i^{(t)}
 * \f$
 *
 * Furthermore, weight-backtracking takes place to increase the
 * stability of the method. In contrast to the original Rprop algorithm
 * with weight-backtracking (see RpropPlus), this weight-backtracking
 * is improved by additionally taking the error of the last iteration
 * \f$t - 1\f$ into account. <br>
 * The idea of this modification is that a change of the sign of the
 * partial derivative \f$\frac{\partial E}{\partial w_i}\f$
 * only indicates that a minimum was skipped, not whether this step
 * led closer to the minimum or not. <br>
 * By using the old error value, the improved weight-backtracking only
 * undoes changes when the error has increased, and only those parameters
 * \f$w_i\f$ are reset to their old values for which a sign change of
 * \f$\frac{\partial E}{\partial w_i}\f$ has taken place. <br>
 * So the new weight-backtracking rule is: <br>
 *
 * \f$
 * \mbox{if\ } \frac{\partial E^{(t-1)}}{\partial w_i} \cdot
 * \frac{\partial E^{(t)}}{\partial w_i} < 0 \mbox{\ then} \{
 * \f$
 *
 * \f$
 * \Delta w_i^{(t)} = \bigg\{
 * \begin{array}{ll}
 * - \Delta w_i^{(t-1)}, & \mbox{if\ } E^{(t)} > E^{(t - 1)} \\
 * 0, & \mbox{otherwise}
 * \end{array}
 * \f$
 *
 * \f$
 * \frac{\partial E^{(t)}}{\partial w_i} := 0
 * \f$
 *
 * \f$\}\f$
 *
 * where the assignment of zero to the partial derivative of the error
 * freezes the increment in the next iteration. <br>
 *
 * This modification of the weight-backtracking leads to better
 * optimization on artificial, paraboloidal error surfaces. <br>
 *
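 * A minimal sketch of the improved backtracking rule, again in the style of
 * the hypothetical per-coordinate loop above (not the Shark implementation;
 * \c error and \c oldError denote the current and previous error values):
 *
 * \code
 * if(g[i] * gOld[i] < 0){                       // sign change: minimum was skipped
 *     delta[i] = std::max(etaMinus * delta[i], minDelta);
 *     if(error > oldError){                     // undo the step only if the error increased
 *         w[i] -= deltaw[i];
 *         deltaw[i] = -deltaw[i];
 *     }else{
 *         deltaw[i] = 0.0;
 *     }
 *     gOld[i] = 0.0;                            // freeze the adaptation in the next iteration
 * }
 * \endcode
 *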
 * For further information about the algorithm, please refer to: <br>
 *
 * Christian Igel and Michael H&uuml;sken, <br>
 * "Empirical Evaluation of the Improved Rprop Learning Algorithm". <br>
 * In Neurocomputing Journal, 2002, in press <br>
 *
 * \author C. Igel, M. H&uuml;sken
 * \date 1999
 *
 * \par Changes:
 * none
 *
 * \par Status:
 * stable
 *
 */
class IRpropPlus : public RpropPlus
{
public:

    /// \brief From INameable: return the class name.
    std::string name() const
    { return "IRpropPlus"; }

    SHARK_EXPORT_SYMBOL void init(ObjectiveFunctionType const& objectiveFunction, SearchPointType const& startingPoint);
    SHARK_EXPORT_SYMBOL void init(ObjectiveFunctionType const& objectiveFunction, SearchPointType const& startingPoint, double initDelta);

    SHARK_EXPORT_SYMBOL void step(ObjectiveFunctionType const& objectiveFunction);

    SHARK_EXPORT_SYMBOL void setDerivativeThreshold(double derivativeThreshold);

    SHARK_EXPORT_SYMBOL void read( InArchive & archive );
    SHARK_EXPORT_SYMBOL void write( OutArchive & archive ) const;

protected:
    //! The error of the last iteration.
    double m_oldError;

    //! A threshold below which partial derivatives are set to zero.
    double m_derivativeThreshold;
};


class IRpropPlusFull : public RpropPlus
{
public:

    /// \brief From INameable: return the class name.
    std::string name() const
    { return "IRpropPlusFull"; }

    SHARK_EXPORT_SYMBOL void init(ObjectiveFunctionType const& objectiveFunction, SearchPointType const& startingPoint);
    SHARK_EXPORT_SYMBOL void init(ObjectiveFunctionType const& objectiveFunction, SearchPointType const& startingPoint, double initDelta);

    SHARK_EXPORT_SYMBOL void step(ObjectiveFunctionType const& objectiveFunction);

    SHARK_EXPORT_SYMBOL void setDerivativeThreshold(double derivativeThreshold);

    SHARK_EXPORT_SYMBOL void read( InArchive & archive );
    SHARK_EXPORT_SYMBOL void write( OutArchive & archive ) const;

protected:
    //! The error of the last iteration.
    double m_oldError;

    //! A threshold below which partial derivatives are set to zero.
    double m_derivativeThreshold;
};

//===========================================================================
/*!
 * \brief This class implements the improved Resilient Backpropagation
 * algorithm (iRprop-) without weight-backtracking.
 *
 * The Rprop algorithm is an improvement over algorithms with adaptive
 * learning rates (such as the Adaptive Backpropagation algorithm by Silva
 * and Almeida; please see AdpBP.h for a description of how such an
 * algorithm works) in that its weight-update increments are independent
 * of the absolute values of the partial derivatives. This makes sense
 * because large flat regions of the search space (plateaus) lead to small
 * absolute partial derivatives, so the increments would be chosen small
 * even though large increments are needed to cross the plateau. In
 * contrast, the absolute partial derivatives are very large at the
 * "slopes" of very "narrow canyons", which leads to large increments
 * that skip the minimum at the bottom of the canyon, even though small
 * increments would be needed to hit it. <br>
 * Therefore, the Rprop algorithm uses only the signs of the partial
 * derivatives, not their absolute values, to adapt the parameters. <br>
 * Instead of individual learning rates, it uses the parameter
 * \f$\Delta_i^{(t)}\f$ for weight \f$w_i,\ i = 1, \dots, n\f$ in
 * iteration \f$t\f$, which is adapted before the weights are
 * changed. <br>
 * As an improving modification, this algorithm adopts the "freezing"
 * of the increment in the next iteration, which is usually only
 * practiced by the Rprop algorithm with weight-backtracking
 * (see RpropPlus), i.e. \f$\frac{\partial E^{(t)}}{\partial w_i} := 0\f$.
 * Tests have shown considerably better optimization when using this
 * modification. So the new adaptation rule of \f$\Delta\f$ is given
 * as: <br>
 *
 * \f$
 * \Delta_i^{(t)} = \Bigg\{
 * \begin{array}{ll}
 * \min( \eta^+ \cdot \Delta_i^{(t-1)}, \Delta_{max} ), & \mbox{if\ }
 * \frac{\partial E^{(t-1)}}{\partial w_i} \cdot
 * \frac{\partial E^{(t)}}{\partial w_i} > 0 \\
 * \max( \eta^- \cdot \Delta_i^{(t-1)}, \Delta_{min} );
 * \frac{\partial E^{(t)}}{\partial w_i} := 0, & \mbox{if\ }
 * \frac{\partial E^{(t-1)}}{\partial w_i} \cdot
 * \frac{\partial E^{(t)}}{\partial w_i} < 0 \\
 * \Delta_i^{(t-1)}, & \mbox{otherwise}
 * \end{array}
 * \f$
 *
 * The parameters \f$\eta^+ > 1\f$ and \f$0 < \eta^- < 1\f$ control
 * the speed of the adaptation. To stabilize the increments, they are
 * restricted to the interval \f$[\Delta_{min}, \Delta_{max}]\f$. <br>
 * After the adaptation of the \f$\Delta_i\f$, the weight update is
 * calculated as
 *
 * \f$
 * \Delta w_i^{(t)} := - \mbox{sign}
 * \left( \frac{\partial E^{(t)}}{\partial w_i}\right) \cdot \Delta_i^{(t)}
 * \f$
 *
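 * A minimal sketch of this modified adaptation, in the same hypothetical
 * per-coordinate setting as for RpropMinus (not the Shark implementation):
 *
 * \code
 * for(std::size_t i = 0; i != w.size(); ++i){
 *     double gi = g[i];
 *     if(gi * gOld[i] > 0){
 *         delta[i] = std::min(etaPlus * delta[i], maxDelta);
 *     }else if(gi * gOld[i] < 0){
 *         delta[i] = std::max(etaMinus * delta[i], minDelta);
 *         gi = 0.0;               // zeroed derivative: no step now, adaptation frozen next time
 *     }
 *     double sign = (gi > 0.0) - (gi < 0.0);
 *     w[i] -= sign * delta[i];    // sign(0) == 0, so nothing happens after a sign change
 *     gOld[i] = gi;
 * }
 * \endcode
 *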
 * For further information about the algorithm, please refer to: <br>
 *
 * Christian Igel and Michael H&uuml;sken, <br>
 * "Empirical Evaluation of the Improved Rprop Learning Algorithm". <br>
 * In Neurocomputing Journal, 2002, in press <br>
 *
 * \author C. Igel, M. H&uuml;sken
 * \date 1999
 *
 * \par Changes:
 * none
 *
 * \par Status:
 * stable
 *
 */
class IRpropMinus : public RpropMinus {
public:

    /// \brief From INameable: return the class name.
    std::string name() const
    { return "IRpropMinus"; }

    SHARK_EXPORT_SYMBOL void step(ObjectiveFunctionType const& objectiveFunction);
};

//! Used to connect the class names with the year of
//! publication of the paper in which the algorithm was introduced.
typedef IRpropPlus Rprop99;

//! Used to connect the class names with the year of
//! publication of the paper in which the algorithm was introduced.
typedef IRpropMinus Rprop99d;

//! Used to connect the class names with the year of
//! publication of the paper in which the algorithm was introduced.
typedef RpropPlus Rprop93;

//! Used to connect the class names with the year of
//! publication of the paper in which the algorithm was introduced.
typedef RpropMinus Rprop94;
}

#endif