MissingFeatureSvmTrainer.h
//===========================================================================
/*!
 *
 *
 * \brief Trainer for binary SVMs natively supporting missing features.
 *
 *
 *
 * \author B. Li
 * \date 2012
 *
 *
 * \par Copyright 1995-2017 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <http://shark-ml.org/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================

#ifndef SHARK_ALGORITHMS_TRAINERS_MISSING_FEATURE_SVM_H
#define SHARK_ALGORITHMS_TRAINERS_MISSING_FEATURE_SVM_H

#include <shark/Algorithms/Trainers/AbstractSvmTrainer.h>
#include <shark/Algorithms/QP/QpSolver.h>
#include <shark/Algorithms/QP/SvmProblems.h>
#include <shark/LinAlg/CachedMatrix.h>
#include <shark/LinAlg/ExampleModifiedKernelMatrix.h>
#include <shark/Models/Kernels/MissingFeaturesKernelExpansion.h>

namespace shark {

/// \brief Trainer for binary SVMs natively supporting missing features.
///
/// This is a specialized variant of the standard binary C-SVM which can be used
/// to learn from data with missing features, without the need for prior imputation
/// of missing values. The key idea is that each example is considered as having an
/// instance-specific margin, which is computed in the lower-dimensional subspace
/// spanned by the features that are present in that example.
/// The resulting optimization problem has the drawback of no longer being convex.
/// However, a local optimum can easily be reached by an iterative wrapper algorithm
/// around a standard C-SVM solver. In detail, example-wise weights \f$ s_i \f$ are
/// incorporated into the quadratic term of the standard SVM optimization problem.
/// These weights are initialized to 1 and then iteratively updated according to the
/// instance-specific margin values, with the SVM solver re-run after each update.
/// In practice, between 2 and 5 such iterations are usually sufficient to reach an
/// acceptable level of convergence.
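///
/// Concretely, if \f$ \|w\| \f$ denotes the norm of the classifier returned by the
/// solver and \f$ \|w^{(i)}\| \f$ its norm evaluated only on the features present in
/// example \f$ i \f$, each pass performs the update \f$ s_i \leftarrow \|w^{(i)}\| / \|w\| \f$;
/// this is the ratio that the calls to computeNorm() in the training loops below evaluate.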
///
/// For details, see the paper:<br/>
/// <p>Max-margin Classification of %Data with Absent Features.
/// Gal Chechik, Geremy Heitz, Gal Elidan, Pieter Abbeel and Daphne Koller. JMLR 2008.</p>
///
/// \note Much of the code in this class is borrowed from the class CSvmTrainer.
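///
/// A minimal usage sketch follows (the kernel choice, its parameter, the value of C,
/// and the dataset are illustrative placeholders, not requirements of this class):
/// \code
/// GaussianRbfKernel<RealVector> kernel(0.5);            // any kernel function works
/// LabeledData<RealVector, unsigned int> dataset = ...;  // binary labels required
///
/// // C = 1.0, with an offset (bias) term
/// MissingFeatureSvmTrainer<RealVector> trainer(&kernel, 1.0, true);
/// trainer.setMaxIterations(4); // wrapper iterations around the C-SVM solver
///
/// MissingFeaturesKernelExpansion<RealVector> svm;
/// trainer.train(svm, dataset);
/// \endcode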
template <class InputType, class CacheType = float>
class MissingFeatureSvmTrainer : public AbstractSvmTrainer<InputType, unsigned int, MissingFeaturesKernelExpansion<InputType> >
{
protected:

    typedef CacheType QpFloatType;
    typedef AbstractKernelFunction<InputType> KernelType;
    typedef AbstractSvmTrainer<InputType, unsigned int, MissingFeaturesKernelExpansion<InputType> > base_type;

public:

    /// Constructor
    MissingFeatureSvmTrainer(KernelType* kernel, double C, bool offset, bool unconstrained = false)
    : base_type(kernel, C, offset, unconstrained)
    , m_maxIterations(4u)
    { }

    /// \brief From INameable: return the class name.
    std::string name() const
    { return "MissingFeatureSvmTrainer"; }

    void train(MissingFeaturesKernelExpansion<InputType>& svm, LabeledData<InputType, unsigned int> const& dataset){
        // Check prerequisites
        SHARK_RUNTIME_CHECK(numberOfClasses(dataset) == 2, "Not a binary problem");

        svm.setStructure(base_type::m_kernel, dataset.inputs(), this->m_trainOffset);

        if(svm.hasOffset())
            trainWithOffset(svm, dataset);
        else
            trainWithoutOffset(svm, dataset);
    }

    void setMaxIterations(std::size_t newIterations)
    { m_maxIterations = newIterations; }

private:

    /// Number of iterations to run
    std::size_t m_maxIterations;

    void trainWithoutOffset(MissingFeaturesKernelExpansion<InputType>& svm, LabeledData<InputType, unsigned int> const& dataset)
    {
        // Initialize the scaling coefficients to 1.0
        RealVector scalingCoefficients(dataset.numberOfElements(), 1.0);

        // The body of this loop:
        //  *) solves the QP with a standard solver, treating the s_i as constants
        //  *) calculates the norms ||w_i|| and ||w||
        //  *) updates each s_i to ||w_i|| / ||w||
        for(std::size_t iteration = 0; iteration != m_maxIterations; ++iteration){
            // Set up the problem
            typedef ExampleModifiedKernelMatrix<InputType, QpFloatType> MatrixType;
            typedef CachedMatrix<MatrixType> CachedMatrixType;
            typedef CSVMProblem<CachedMatrixType> SVMProblemType;
            typedef SvmShrinkingProblem<SVMProblemType> ProblemType;
            MatrixType kernelMatrix(*base_type::m_kernel, dataset.inputs());
            kernelMatrix.setScalingCoefficients(scalingCoefficients);
            CachedMatrixType matrix(&kernelMatrix);
            SVMProblemType svmProblem(matrix, dataset.labels(), this->C());
            ProblemType problem(svmProblem, base_type::m_shrinking);

            // Solve it
            QpSolver<ProblemType> solver(problem);
            solver.solve(base_type::stoppingCondition(), &base_type::solutionProperties());
            RealVector alpha = problem.getUnpermutedAlpha();

            // Update the s_i
            const double classifierNorm = svm.computeNorm(alpha, scalingCoefficients);
            SHARK_ASSERT(classifierNorm > 0.0);
            for (std::size_t i = 0; i < scalingCoefficients.size(); ++i)
            {
                // Norm of the classifier restricted to the features present in
                // example i, relative to the norm of the full classifier
                scalingCoefficients(i) = svm.computeNorm(
                    alpha,
                    scalingCoefficients,
                    dataset.element(i).input)
                    / classifierNorm;
            }

            // In the last iteration, store alpha inside the svm
            if(iteration == m_maxIterations-1)
                column(svm.alpha(), 0) = alpha;

            // Keep track of the number of kernel evaluations
            base_type::m_accessCount += kernelMatrix.getAccessCount();
        }
        svm.setScalingCoefficients(scalingCoefficients);
    }

    void trainWithOffset(MissingFeaturesKernelExpansion<InputType>& svm, LabeledData<InputType, unsigned int> const& dataset)
    {
        // Initialize the scaling coefficients to 1.0
        std::size_t datasetSize = dataset.numberOfElements();
        RealVector scalingCoefficients(datasetSize, 1.0);

        // The body of this loop:
        //  *) solves the QP with a standard solver, treating the s_i as constants
        //  *) calculates the norms ||w_i|| and ||w||
        //  *) updates each s_i to ||w_i|| / ||w||
        for(std::size_t iteration = 0; iteration != m_maxIterations; ++iteration){
            // Set up the problem
            typedef ExampleModifiedKernelMatrix<InputType, QpFloatType> MatrixType;
            typedef CachedMatrix<MatrixType> CachedMatrixType;
            typedef CSVMProblem<CachedMatrixType> SVMProblemType;
            typedef SvmShrinkingProblem<SVMProblemType> ProblemType;
            MatrixType kernelMatrix(*base_type::m_kernel, dataset.inputs());
            kernelMatrix.setScalingCoefficients(scalingCoefficients);
            CachedMatrixType matrix(&kernelMatrix);
            SVMProblemType svmProblem(matrix, dataset.labels(), this->C());
            ProblemType problem(svmProblem, base_type::m_shrinking);

            // Solve it
            QpSolver<ProblemType> solver(problem);
            solver.solve(base_type::stoppingCondition(), &base_type::solutionProperties());
            RealVector unpermutedAlpha = problem.getUnpermutedAlpha();

            // Update the s_i
            const double classifierNorm = svm.computeNorm(unpermutedAlpha, scalingCoefficients);
            SHARK_ASSERT(classifierNorm > 0.0);
            for (std::size_t i = 0; i < scalingCoefficients.size(); ++i)
            {
                // Norm of the classifier restricted to the features present in
                // example i, relative to the norm of the full classifier
                scalingCoefficients(i) = svm.computeNorm(
                    unpermutedAlpha,
                    scalingCoefficients,
                    dataset.element(i).input
                ) / classifierNorm;
            }

            if(iteration == m_maxIterations-1){
                // In the last iteration, compute the offset (i.e., the bias b)
                // and store it, along with alpha, inside the svm
                column(svm.alpha(), 0) = unpermutedAlpha;
                double lowerBound = -1e100;
                double upperBound = 1e100;
                double sum = 0.0;
                std::size_t freeVars = 0;

                // Determine the bounds on b from the variables at the box
                // constraints, and average the gradients of the free variables
                for (std::size_t i = 0; i < datasetSize; i++)
                {
                    // In case of no free SVs, we are looking for the largest gradient
                    // over all alphas at the lower bound and the smallest gradient
                    // over all alphas at the upper bound
                    const double value = problem.gradient(i);
                    if (problem.alpha(i) == problem.boxMin(i)){
                        lowerBound = std::max(value, lowerBound);
                    }
                    else if (problem.alpha(i) == problem.boxMax(i)){
                        upperBound = std::min(value, upperBound);
                    }
                    else{
                        sum += value;
                        freeVars++;
                    }
                }

                if (freeVars > 0) {
                    // Stabilized (averaged) exact value
                    svm.offset(0) = sum / freeVars;
                } else {
                    // TODO: work out how to compute the derivative with missing features

                    // Return the best estimate
                    svm.offset(0) = 0.5 * (lowerBound + upperBound);
                }
            }

            // Keep track of the number of kernel evaluations
            base_type::m_accessCount += kernelMatrix.getAccessCount();
        }
        svm.setScalingCoefficients(scalingCoefficients);
    }

};

} // namespace shark

#endif