MergeBudgetMaintenanceStrategy.h
//===========================================================================
/*!
 *
 *
 * \brief Merge budget maintenance strategy
 *
 * \par
 * This is a budget maintenance strategy that adds a new vector by merging
 * a pair of budget vectors. The first vector of the pair is found by
 * searching for the budget vector with the smallest alpha-coefficients
 * (measured in 2-norm); the second one is found by computing a certain
 * degradation measure for every possible partner. The strategy is
 * therefore linear in the size of the budget.
 *
 * \par
 * The method is an implementation of the merge strategy
 * given in Wang, Crammer, Vucetic: "Breaking the Curse of Kernelization:
 * Budgeted Stochastic Gradient Descent for Large-Scale SVM Training"
 * and owes very much to the implementation in BudgetedSVM.
 *
 *
 *
 * \author Aydin Demircioglu
 * \date 2014
 *
 *
 * \par Copyright 1995-2017 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <http://shark-ml.org/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================


#ifndef SHARK_MODELS_MERGEBUDGETMAINTENANCESTRATEGY_H
#define SHARK_MODELS_MERGEBUDGETMAINTENANCESTRATEGY_H

#include <shark/Algorithms/GradientDescent/LineSearch.h>
#include <shark/Algorithms/Trainers/Budgeted/AbstractBudgetMaintenanceStrategy.h>
#include <shark/Data/Dataset.h>
#include <shark/Data/DataView.h>
#include <shark/Models/Kernels/KernelExpansion.h>


namespace shark
{

///
/// \brief Budget maintenance strategy that merges two vectors
///
/// \par This is a budget maintenance strategy that merges two budget vectors
/// in order to make space for a new one. It first searches for the budget
/// vector with the smallest coefficient norm \f$ \|\alpha\|_2 \f$; only
/// then a second one is selected, by inspecting the expected weight
/// degradation after merging. The partner with the smallest weight
/// degradation is the one merged with the first vector.
/// With this heuristic, the merging strategy has complexity \f$ \mathcal{O}(B) \f$.
/// Compared with the projection strategy, merging should be faster while
/// still obtaining similar accuracy. Unfortunately, the reference paper of
/// Wang, Crammer and Vucetic does not report timing results.
///
/// \par Note that in general it is unclear how two data objects should be merged,
/// e.g. strings must be merged differently than vectors. Therefore it is necessary
/// to create a specialization of this strategy for a given input type.
///
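/// \par Example
/// A minimal usage sketch (hypothetical setup; the constructor signature of
/// KernelBudgetedSGDTrainer is assumed here as shown in the Shark tutorials):
/// \code
/// GaussianRbfKernel<RealVector> kernel(0.5);   // assumed kernel and bandwidth
/// HingeLoss hingeLoss;                         // loss driving the SGD updates
/// ClassificationDataset trainingData;          // assumed to be loaded elsewhere
///
/// // merge two budget vectors whenever the budget of 500 vectors overflows
/// MergeBudgetMaintenanceStrategy<RealVector>* strategy = new MergeBudgetMaintenanceStrategy<RealVector>();
/// KernelBudgetedSGDTrainer<RealVector> trainer(&kernel, &hingeLoss, 1.0, true, false, 500, strategy);
///
/// KernelClassifier<RealVector> classifier;
/// trainer.train(classifier, trainingData);     // (ownership/cleanup omitted for brevity)
/// \endcode
///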
template<class InputType>
class MergeBudgetMaintenanceStrategy: public AbstractBudgetMaintenanceStrategy<InputType>
{};


///
/// \brief Budget maintenance strategy merging vectors.
///
/// \par This is a specialization of the merge budget maintenance strategy
/// that handles simple real-valued vectors. It is a nearly 1:1 adaptation of
/// the strategy presented in Wang, Crammer and Vucetic.
///
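/// \par
/// The computations below assume a normalized kernel with \f$ k(x, x) = 1 \f$,
/// e.g. a Gaussian RBF kernel: for such a kernel the values between the merged
/// vector \f$ z = h x_m + (1 - h) x_n \f$ and its parents have the closed form
/// \f$ k(x_m, z) = k(x_m, x_n)^{(1-h)^2} \f$ and \f$ k(x_n, z) = k(x_m, x_n)^{h^2} \f$.
///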
template<>
class MergeBudgetMaintenanceStrategy<RealVector>: public AbstractBudgetMaintenanceStrategy<RealVector>
{
    typedef KernelExpansion<RealVector> ModelType;
    typedef LabeledData<RealVector, unsigned int>::element_type ElementType;
    typedef RealVector InputType;

public:

    /// This is the objective function we need to optimize during merging.
    /// Basically the merging strategy needs a line search to find the parameter
    /// \f$ h \f$ which maximizes \f$ a \cdot k(x_m, z) + b \cdot k(x_n, z) \f$
    /// for the merged vector \f$ z = h x_m + (1 - h) x_n \f$; for a Gaussian
    /// kernel this equals \f$ a \cdot k^{(1-h)^2} + b \cdot k^{h^2} \f$ with
    /// \f$ k = k(x_m, x_n) \f$ (all in the notation of Wang, Crammer and Vucetic).
    /// The coefficients a and b are given by the alpha coefficients of the
    /// corresponding support vectors \f$ x_m \f$ and \f$ x_n \f$; more precisely,
    /// \f$ a = \sum_i \alpha_m^{(i)}/d_i \f$ and \f$ b = 1 - a = \sum_i \alpha_n^{(i)}/d_i \f$
    /// with \f$ d_i = \alpha_m^{(i)} + \alpha_n^{(i)} \f$.
    ///
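    /// \par
    /// A worked instance (illustration only, not from the paper): for
    /// \f$ a = b = 0.5 \f$ and \f$ k = 0.5 \f$ the problem is symmetric and the
    /// derivative vanishes at \f$ h = 0.5 \f$, where the objective value is
    /// \f$ 0.5^{0.25} \approx 0.84 \f$; the endpoints \f$ h \in \{0, 1\} \f$ only
    /// reach \f$ 0.5 \cdot (1 + 0.5) = 0.75 \f$, so the merged vector is the
    /// midpoint of \f$ x_m \f$ and \f$ x_n \f$ in this case.
    ///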
    struct MergingProblemFunction : public SingleObjectiveFunction
    {
        /// class name
        std::string name() const
        { return "MergingProblemFunction"; }


        /// parameters of the function
        double m_a, m_b;
        double m_k; ///< contains (in our problem) k(x_m, x_n)


        /// constructor.
        /// \param[in] a a coefficient of the formula
        /// \param[in] b b coefficient of the formula
        /// \param[in] k k coefficient of the formula
        ///
        MergingProblemFunction(double a, double b, double k)
        {
            m_a = a;
            m_b = b;
            m_k = k;
            m_features |= HAS_FIRST_DERIVATIVE; // announce the analytic derivative for the line search
        }


        /// number of variables; we have a one-dimensional problem here.
        std::size_t numberOfVariables() const
        {
            return 1;
        }


        /// evaluation
        /// \param[in] pattern Point to evaluate the function at. As we have a 1d problem,
        /// everything beyond the first component is ignored.
        /// \return function value at the point
        ///
        virtual double eval(RealVector const& pattern) const
        {
            double h = pattern(0);
            // we want to maximize, so we minimize the negated function
            return -(m_a * pow(m_k, (1.0 - h) * (1.0 - h)) + m_b * pow(m_k, h * h));
        }


        /// derivative of the function.
        /// The line search uses the analytic derivative; for the negated
        /// objective it is
        /// \f$ 2 \ln(k) \left( -a (h-1) k^{(h-1)^2} - b h k^{h^2} \right) \f$.
        /// \param[in] input Point to evaluate the function at
        /// \param[out] derivative Derivative at the given point
        /// \return Function value at the point
        ///
        virtual double evalDerivative(const SearchPointType & input, FirstOrderDerivative & derivative) const
        {
            double h = input(0);
            // we want to maximize, so we minimize the negated function
            derivative(0) = 2 * log(m_k) * (-m_a * (h - 1.0) * pow(m_k, (h - 1.0) * (h - 1.0))
                                            - m_b * h * pow(m_k, (h * h)));
            return eval(input);
        }
    };

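    /* A quick sanity check (illustration only): the analytic derivative can be
       compared against a central finite difference, e.g.

           MergingProblemFunction f(0.7, 0.3, 0.5);
           RealVector h(1), d(1);
           h(0) = 0.25;
           f.evalDerivative(h, d);              // analytic derivative in d(0)
           double eps = 1e-6;
           h(0) = 0.25 + eps; double hi = f.eval(h);
           h(0) = 0.25 - eps; double lo = f.eval(h);
           // (hi - lo) / (2 * eps) should agree with d(0) up to O(eps^2)
    */
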

    /// Reduce the budget.
    /// This is a helper routine. After addToModel has added the new support vector
    /// at the end of the budget (whose size was chosen one larger than the capacity),
    /// this routine does the actual merging. Given one index, it searches for a second
    /// index such that merging the pair is 'optimal'. It then performs the merge. After
    /// that, the last budget vector is freed again (by setting its alpha-coefficients to zero).
    /// \param[in,out] model Model to work on
    /// \param[in] firstIndex The index of the first element of the pair to merge.
    ///
    virtual void reduceBudget(ModelType& model, size_t firstIndex)
    {
        size_t maxIndex = model.basis().numberOfElements();

        // compute the kernel row between the given first element and all others;
        // this takes O(B) time, as it is a row of size B
        blas::vector<float> kernelRow(maxIndex, 0.0);
        for(size_t j = 0; j < maxIndex; j++)
            kernelRow(j) = static_cast<float>(model.kernel()->eval(model.basis().element(firstIndex), model.basis().element(j)));

        // initialize the search
        double fret(0.);
        RealVector h(1);  // initial search starting point
        RealVector xi(1); // direction of search
        RealVector d(1);  // derivative after the line search (not needed)

        // keep track of the parameters at the minimum
        double minDegradation = std::numeric_limits<double>::infinity();
        double minH = 0.0;
        double minAlphaMergedFirst = 0.0;
        double minAlphaMergedSecond = 0.0;
        size_t secondIndex = 0;


        // we need to check every other vector
        RealMatrix &alpha = model.alpha();
        for(size_t currentIndex = 0; currentIndex < maxIndex; currentIndex++)
        {
            // skip the vector that is already chosen
            if(firstIndex == currentIndex)
                continue;

            // compute the merge coefficients for the model; this is the formula
            // between (6.7) and (6.8) in Wang, Crammer, Vucetic
            double a = 0.0;
            double b = 0.0;
            for(size_t c = 0; c < alpha.size2(); c++)
            {
                // guard the denominator d_i = alpha_m^(i) + alpha_n^(i) away from zero
                double d = alpha(currentIndex, c) + alpha(firstIndex, c);
                if(std::abs(d) < 0.00001)
                    d = (d < 0.0) ? -0.00001 : 0.00001;
                a += alpha(firstIndex, c) / d;
                b += alpha(currentIndex, c) / d;
            }

            // initialize search starting point and direction
            h(0) = 0.0;
            xi(0) = 0.5;
            double k = kernelRow(currentIndex);
            MergingProblemFunction mergingProblemFunction(a, b, k);
            fret = mergingProblemFunction.evalDerivative(h, d);

            // perform a line search
            LineSearch lineSearch;
            lineSearch.lineSearchType() = LineSearch::Dlinmin;
            lineSearch.init(mergingProblemFunction);
            lineSearch(h, fret, xi, d, 1.0);

            // the optimal mixing parameter is now given by h. The vector that
            // corresponds to it is $z = h x_m + (1-h) x_n$ by formula (6.7);
            // it is only constructed after the best pair has been found, so we
            // do not need to form it here.

            // finding the coefficient of z is another minimization problem, whose
            // optimal solution is $\alpha_z^{(i)} = \alpha_m^{(i)} k(x_m, z) + \alpha_n^{(i)} k(x_n, z)$.
            // (Maybe KernelBasisDistance could be used instead, but it is unclear
            // whether it would be faster.)

            // for a Gaussian kernel, z lies on the segment between x_m and x_n,
            // so k(x_m, z) = k^{(1-h)^2} and k(x_n, z) = k^{h^2} with k = k(x_m, x_n)
            long double alphaMergedFirst = pow(k, (1.0 - h(0)) * (1.0 - h(0)));
            long double alphaMergedCurrent = pow(k, h(0) * h(0));

            // the degradation is computed for each class by applying formula (6.8)
            // and summing up; a kernel with $k(x,x) = 1$ is assumed here
            double currentDegradation = 0.0;
            for(size_t c = 0; c < alpha.size2(); c++)
            {
                double zAlpha = alphaMergedFirst * alpha(firstIndex, c) + alphaMergedCurrent * alpha(currentIndex, c);
                // this is the squared norm of the weight change when both budget
                // vectors are replaced by their projection onto the merged vector z:
                // ||a_m phi(x_m) + a_n phi(x_n)||^2 - <a_m phi(x_m) + a_n phi(x_n), phi(z)>^2
                currentDegradation += pow(alpha(firstIndex, c), 2) + pow(alpha(currentIndex, c), 2) +
                                      2.0 * k * alpha(firstIndex, c) * alpha(currentIndex, c) - zAlpha * zAlpha;
            }

            // remember the best pair seen so far
            if(currentDegradation < minDegradation)
            {
                minDegradation = currentDegradation;
                minH = h(0);
                minAlphaMergedFirst = alphaMergedFirst;
                minAlphaMergedSecond = alphaMergedCurrent;
                secondIndex = currentIndex;
            }
        }

        // compute the merged vector
        RealVector firstVector = model.basis().element(firstIndex);
        RealVector secondVector = model.basis().element(secondIndex);
        RealVector mergedVector = minH * firstVector + (1.0 - minH) * secondVector;

        // replace the second vector by the merged one
        model.basis().element(secondIndex) = mergedVector;

        // and update the alphas: alpha_z = k(x_m, z) alpha_m + k(x_n, z) alpha_n
        for(size_t c = 0; c < alpha.size2(); c++)
        {
            alpha(secondIndex, c) = minAlphaMergedFirst * alpha(firstIndex, c) + minAlphaMergedSecond * alpha(secondIndex, c);
        }

        // the entry at firstIndex is now obsolete, so we move the
        // last vector, which serves as a buffer, to this position
        row(alpha, firstIndex) = row(alpha, maxIndex - 1);
        model.basis().element(firstIndex) = model.basis().element(maxIndex - 1);

        // free the buffer entry by zeroing its alphas
        row(model.alpha(), maxIndex - 1).clear();
    }



    /// Add a vector to the model.
    /// This adds the given vector to the model and merges the budget so that afterwards
    /// the budget size stays the same. If the budget has a free entry anyway, no merging
    /// is performed; instead the given vector is simply added to the budget.
    ///
    /// @param[in,out] model the model the strategy will work with
    /// @param[in] alpha alphas for the new budget vector
    /// @param[in] supportVector the vector to add to the model by applying the maintenance strategy
    ///
    virtual void addToModel(ModelType& model, InputType const& alpha, ElementType const& supportVector)
    {

        // find the two indices we want to merge.

        // note that we have to cheat a little, as the model has
        // a fixed size, but we actually want to work with the model
        // plus the new support vector. Therefore the budget size
        // is one greater than the user specified, and we use this
        // last entry of the model as a buffer. It is freed again
        // when merging is finished.

        // put the new vector into place
        size_t maxIndex = model.basis().numberOfElements();
        model.basis().element(maxIndex - 1) = supportVector.input;
        row(model.alpha(), maxIndex - 1) = alpha;


        // the first vector to merge is the one with the smallest alpha coefficients
        // (it cannot be our new vector, because in each iteration the
        // old weights get downscaled while the new one gets the largest)
        size_t firstIndex = 0;
        double firstAlpha = 0;
        findSmallestVector(model, firstIndex, firstAlpha);

        // if the smallest vector has zero alpha,
        // the budget is not yet filled, so we can skip merging.
        if(firstAlpha == 0.0)
        {
            // as we need the last entry to stay free, we move the new
            // vector to the free slot, undoing the placement at the back
            model.basis().element(firstIndex) = supportVector.input;
            row(model.alpha(), firstIndex) = alpha;

            // zeroing out the alphas is enough to free the buffer
            row(model.alpha(), maxIndex - 1).clear();

            // ready.
            return;
        }

        // the second vector of the pair is found inside reduceBudget by
        // searching for the best match, which takes O(B) time; reduceBudget
        // also performs the merge and frees the buffer entry again, as we
        // cannot, as said, just extend the model by the new vector.
        reduceBudget(model, firstIndex);
    }


    /// class name
    std::string name() const
    { return "MergeBudgetMaintenanceStrategy"; }
};

}
#endif