KHCTree.h
Go to the documentation of this file.
1 //===========================================================================
2 /*!
3  *
4  *
5  * \brief Tree for nearest neighbor search in kernel-induced feature spaces.
6  *
7  *
8  *
9  * \author T. Glasmachers
10  * \date 2011
11  *
12  *
13  * \par Copyright 1995-2017 Shark Development Team
14  *
15  * <BR><HR>
16  * This file is part of Shark.
17  * <http://shark-ml.org/>
18  *
19  * Shark is free software: you can redistribute it and/or modify
20  * it under the terms of the GNU Lesser General Public License as published
21  * by the Free Software Foundation, either version 3 of the License, or
22  * (at your option) any later version.
23  *
24  * Shark is distributed in the hope that it will be useful,
25  * but WITHOUT ANY WARRANTY; without even the implied warranty of
26  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27  * GNU Lesser General Public License for more details.
28  *
29  * You should have received a copy of the GNU Lesser General Public License
30  * along with Shark. If not, see <http://www.gnu.org/licenses/>.
31  *
32  */
33 //===========================================================================
34 
35 #ifndef SHARK_ALGORITHMS_NEARESTNEIGHBORS_KHCTREE_H
36 #define SHARK_ALGORITHMS_NEARESTNEIGHBORS_KHCTREE_H
37 
38 
40 #include <shark/LinAlg/Base.h>
41 #include <boost/array.hpp>
42 
43 namespace shark {
44 
45 
46 ///
47 /// \brief KHC-tree, a binary space-partitioning tree
48 ///
49 /// \par
50 /// KHC-tree data structure for efficient nearest
51 /// neighbor searches in kernel-induced feature
52 /// spaces.
53 /// "KHC" stands for Kernel Hierarchical Clustering.
54 /// The space separation is based on distances to
55 /// cluster centers.
56 ///
57 /// \par
58 /// The tree is constructed from data by splitting
59 /// along the pair of data points with largest
60 /// distance. This quantity is approximated using
61 /// a given number of randomly sampled pairs, which
62 /// is controlled by the template parameter
63 /// CuttingAccuracy.
64 ///
65 /// \par
66 /// The bucket size for the tree is always one,
67 /// such that there is a unique point in each leaf
68 /// cell.
69 ///
70 ///Since the KHCTree needs direct access to the elements, it's template parameter is not the actual point type
71 ///But the Range, the points are stored in. Be aware that this range should be a View when a Dataset is used as storage,
72 ///since during construction, the KHC-Tree needs random access to the elements.
73 template <class Container, int CuttingAccuracy = 25>
74 class KHCTree : public BinaryTree<typename Container::value_type>
75 {
76 public:
78  typedef typename Container::value_type value_type;
81 
82  /// Construct the tree from data.
83  /// It is assumed that the container exceeds
84  /// the lifetime of the KHCTree (which holds
85  /// only references to the points), and that
86  /// the memory locations of the points remain
87  /// unchanged.
 ///
 /// \param points  range of data points; must outlive this tree
 /// \param kernel  kernel inducing the feature-space metric (not owned)
 /// \param tc      stopping criteria (maximum depth / bucket size)
88  KHCTree(Container const& points, kernel_type const* kernel, TreeConstruction tc = TreeConstruction())
89  : base_type(points.size())
90  , mep_kernel(kernel)
91  , m_normalInvNorm(1.0)
92  {
93  //create a list of iterators to the elements as temporary storage;
94  //we need indexed operators to have a fast lookup of the position of the elements in the container
95  std::vector<const_iterator> elements(m_size);
96  boost::iota(elements,const_iterator(boost::begin(points),0));
97 
98  buildTree(tc,elements);
99 
100  //after construction of the tree, the iterators in the array are sorted in the order
101  //in which the nodes reference them, so we can fill the index list with the iterators' indices
102  for(std::size_t i = 0; i != m_size; ++i){
103  mp_indexList[i] = elements[i].index();
104  }
105  }
106 
107 
108  /// \par
109  /// Compute the squared distance of this cell to
110  /// the given reference point, or alternatively
111  /// a lower bound on this value.
 ///
 /// \par
 /// Walks up the chain of ancestor nodes; for each ancestor the
 /// signed distance of the reference to its splitting plane (sign
 /// flipped when this subtree is the ancestor's right child) is a
 /// lower bound, and the maximum of these is returned, squared.
112  double squaredDistanceLowerBound(value_type const& reference) const{
113  double dist = 0.0;
114  KHCTree const* t = this;
115  KHCTree const* p = (KHCTree const*)mep_parent;
116  while (p != NULL){
117  double v = p->distanceFromPlane(reference);
 // flip the sign when we came from the right child
118  if (t == p->mp_right)
119  v = -v;
120  if (v > dist)
121  dist = v;
122  t = p;
123  p = (KHCTree const*)p->mep_parent;
124  }
125  return (dist * dist);
126  }
127 
128 protected:
129  using base_type::mep_parent;
130  using base_type::mp_left;
131  using base_type::mp_right;
133  using base_type::m_size;
134  using base_type::m_nodes;
135 
136  /// (internal) construction of a non-root node
137  KHCTree(KHCTree* parent, std::size_t* list, std::size_t size)
138  : base_type(parent, list, size)
139  , mep_kernel(parent->mep_kernel)
140  , m_normalInvNorm(1.0)
141  { }
142 
143  /// (internal) recursive construction method: splits the cell along
144  /// the direction between an (approximately) farthest pair of points, see calculateNormal
145  template<class Range>
146  void buildTree(TreeConstruction tc, Range& points){
147  typedef typename Range::iterator range_iterator;
148 
149  //check whether we are finished
150  if (tc.maxDepth() == 0 || m_size <= tc.maxBucketSize()) {
151  m_nodes = 1;
152  return;
153  }
154 
155  // use only a subset of size at most CuttingAccuracy
156  // to estimate the vector along the longest
157  // distance
158  if (m_size <= CuttingAccuracy){
159  calculateNormal(points);
160  }
161  else{
 // evenly spaced sample: the middle element of each of
 // CuttingAccuracy equally sized chunks of the range
162  boost::array<const_iterator,CuttingAccuracy> samples;
163  for(std::size_t i = 0; i != CuttingAccuracy; i++)
164  samples[i] = points[m_size * (2*i+1) / (2*CuttingAccuracy)];
165  calculateNormal(samples);
166  }
167 
168  //calculate the distance from the plane for every point in the list
169  std::vector<double> distance(m_size);
170  for(std::size_t i = 0; i != m_size; ++i){
171  distance[i] = funct(*points[i]);
172  }
173 
174 
175  // split the list into sub-cells
176  range_iterator split = this->splitList(distance,points);
177  range_iterator begin = boost::begin(points);
178  range_iterator end = boost::end(points);
179 
180  if (split == end) {//can't split points.
181  m_nodes = 1;
182  return;
183  }
184 
185  // create sub-nodes
186  std::size_t leftSize = split-begin;
187  mp_left = new KHCTree(this, mp_indexList, leftSize);
188  mp_right = new KHCTree(this, mp_indexList + leftSize, m_size - leftSize);
189 
190  // recurse
191  boost::iterator_range<range_iterator> left(begin,split);
192  boost::iterator_range<range_iterator> right(split,end);
193  ((KHCTree*)mp_left)->buildTree(tc.nextDepthLevel(),left);
194  ((KHCTree*)mp_right)->buildTree(tc.nextDepthLevel(),right);
195  m_nodes = 1 + mp_left->nodes() + mp_right->nodes();
196  }
197 
 /// (internal) choose the pair of sample points with the largest
 /// kernel-induced distance as the "positive" and "negative" cluster
 /// centers, and store the inverse of their distance for normalization
198  template<class Range>
199  void calculateNormal(Range const& samples){
200  std::size_t numSamples =batchSize(samples);
201  double best_dist2 = -1.0;
 // exhaustive search over all sample pairs (i,j), j < i
202  for (std::size_t i=1; i != numSamples; i++){
203  for (std::size_t j = 0; j != i; j++){
204  double dist2 = mep_kernel->featureDistanceSqr(*samples[i], *samples[j]);
205  if (dist2 > best_dist2){
206  best_dist2 = dist2;
207  mep_positive = samples[i];
208  mep_negative = samples[j];
209  }
210  }
211  }
212  m_normalInvNorm = 1.0 / std::sqrt(best_dist2);
 // guard against coinciding centers (zero distance -> inf/NaN scale)
213  if (! (boost::math::isfinite)(m_normalInvNorm))
214  m_normalInvNorm = 1.0;
215  }
216 
217 
218 
219  /// function describing the separating hyperplane:
 /// difference of the kernel evaluations against the "positive" and
 /// "negative" cluster centers, scaled by m_normalInvNorm — i.e. the
 /// projection of the reference onto the normalized cutting direction
220  double funct(value_type const& reference) const{
221  double result = mep_kernel->eval(*mep_positive, reference);
222  result -= mep_kernel->eval(*mep_negative, reference);
223  result *= m_normalInvNorm;
224  return result;
225  }
226 
227  /// kernel function
228  kernel_type const* mep_kernel;
229 
230  /// "positive" side cluster center
231  const_iterator mep_positive;
232 
233  /// "negative" side cluster center
234  const_iterator mep_negative;
235 
236  /// one divided by the distance between "positive" and "negative" cluster center
238 };
239 
240 
241 }
242 #endif