JaakkolaHeuristic.h
Go to the documentation of this file.
1 //===========================================================================
2 /*!
3 * \brief Jaakkola's heuristic and related quantities for Gaussian kernel selection
4 *
5 * \author T. Glasmachers, O. Krause, C. Igel
6 * \date 2010
7 *
8 *
9 * \par Copyright (c) 1999-2010:
10 * Institut f&uuml;r Neuroinformatik<BR>
11 * Ruhr-Universit&auml;t Bochum<BR>
12 * D-44780 Bochum, Germany<BR>
13 * Phone: +49-234-32-25558<BR>
14 * Fax: +49-234-32-14209<BR>
15 * eMail: Shark-admin@neuroinformatik.ruhr-uni-bochum.de<BR>
16 * www: http://www.neuroinformatik.ruhr-uni-bochum.de<BR>
17 *
18 *
19 *
20 * <BR><HR>
21 * This file is part of Shark. This library is free software;
22 * you can redistribute it and/or modify it under the terms of the
23 * GNU General Public License as published by the Free Software
24 * Foundation; either version 3, or (at your option) any later version.
25 *
26 * This library is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * You should have received a copy of the GNU General Public License
32 * along with this library; if not, see <http://www.gnu.org/licenses/>.
33 *
34 */
35 //===========================================================================
36 
37 
38 #ifndef SHARK_ML_JAAKOLAHEURISTIC_H
39 #define SHARK_ML_JAAKOLAHEURISTIC_H
40 
41 
42 #include <shark/Data/Dataset.h>
43 #include <algorithm>
44 
45 namespace shark{
46 
47 
48 /// \brief Jaakkola's heuristic and related quantities for Gaussian kernel selection
49 ///
50 /// \par
51 /// Jaakkola's heuristic method for setting the width parameter of the
52 /// Gaussian radial basis function kernel is to pick a quantile (usually
53 /// the median) of the distribution of Euclidean distances between points
54 /// having different labels. The present implementation computes the kernel
55 /// width \f$ \sigma \f$ and the bandwidth
56 /// \f[ \gamma = \frac{1}{2 \sigma^2} \f]
57 /// based on the median or on any other quantile of the empirical
58 /// distribution.
59 ///
60 /// In the original paper, only the distance to the closest point with
61 /// different label is considered. This behavior can be turned on by
62 /// an option of the constructor.
64 {
65 public:
66  /// Constructor
67  /// \param data vector-valued input data
68  /// \param nearestFalseNeighbor if true, only the nearest neighboring point with different label is considered (default false)
69  template<class InputType>
70  JaakkolaHeuristic(LabeledData<InputType,unsigned int> const& dataset, bool nearestFalseNeighbor = false)
71  {
73  Elements elements = dataset.elements();
74  if(!nearestFalseNeighbor) {
75  for(typename Elements::iterator it = elements.begin(); it != elements.end(); ++it){
76  typename Elements::iterator itIn = it;
77  itIn++;
78  for (; itIn != elements.end(); itIn++) {
79  if (itIn->label == it->label) continue;
80  double dist = distanceSqr(it->input,itIn->input);
81  m_stat.push_back(dist);
82  }
83  }
84  } else {
85  for(typename Elements::iterator it = elements.begin(); it != elements.end(); ++it){
86  double minDistSqr = 0;
87  for (typename Elements::iterator itIn = elements.begin(); itIn != elements.end(); itIn++) {
88  if (itIn->label == it->label) continue;
89  double dist = distanceSqr(it->input,itIn->input);
90  if( (minDistSqr == 0) || (dist < minDistSqr)) minDistSqr = dist;
91  }
92  m_stat.push_back(minDistSqr);
93  }
94  }
95  std::sort(m_stat.begin(), m_stat.end());
96  }
97 
98  /// Compute the given quantile (usually median)
99  /// of the empirical distribution of Euclidean distances
100  /// of data pairs with different labels.
101  double sigma(double quantile = 0.5)
102  {
103  std::size_t ic = m_stat.size();
104  SHARK_ASSERT(ic > 0);
105 
106  std::sort(m_stat.begin(), m_stat.end());
107 
108  if (quantile < 0.0)
109  {
110  // TODO: find minimum
111  return std::sqrt(m_stat[0]);
112  }
113  if (quantile >= 1.0)
114  {
115  // TODO: find maximum
116  return std::sqrt(m_stat[ic-1]);
117  }
118  else
119  {
120  // TODO: partial sort!
121  double t = quantile * (ic - 1);
122  std::size_t i = (std::size_t)floor(t);
123  double rest = t - i;
124  return ((1.0 - rest) * std::sqrt(m_stat[i]) + rest * std::sqrt(m_stat[i+1]));
125  }
126  }
127 
128  /// Compute the given quantile (usually the median)
129  /// of the empirical distribution of Euclidean distances
130  /// of data pairs with different labels converted into
131  /// a value usable as the gamma parameter of the GaussianRbfKernel.
132  double gamma(double quantile = 0.5)
133  {
134  double s = sigma(quantile);
135  return 0.5 / (s * s);
136  }
137 
138 
139 protected:
140  /// all pairwise distances
141  std::vector<double> m_stat;
142 };
143 
144 }
145 #endif