KDTree.h
Go to the documentation of this file.
1 //===========================================================================
2 /*!
3  *
4  *
5  * \brief Tree for nearest neighbor search in low dimensions.
6  *
7  *
8  *
9  * \author T. Glasmachers
10  * \date 2011
11  *
12  *
13  * \par Copyright 1995-2017 Shark Development Team
14  *
15  * <BR><HR>
16  * This file is part of Shark.
17  * <http://shark-ml.org/>
18  *
19  * Shark is free software: you can redistribute it and/or modify
20  * it under the terms of the GNU Lesser General Public License as published
21  * by the Free Software Foundation, either version 3 of the License, or
22  * (at your option) any later version.
23  *
24  * Shark is distributed in the hope that it will be useful,
25  * but WITHOUT ANY WARRANTY; without even the implied warranty of
26  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27  * GNU Lesser General Public License for more details.
28  *
29  * You should have received a copy of the GNU Lesser General Public License
30  * along with Shark. If not, see <http://www.gnu.org/licenses/>.
31  *
32  */
33 //===========================================================================
34 
35 #ifndef SHARK_ALGORITHMS_NEARESTNEIGHBORS_KDTREE_H
36 #define SHARK_ALGORITHMS_NEARESTNEIGHBORS_KDTREE_H
37 
38 
40 #include <shark/Data/DataView.h>
41 #include <shark/LinAlg/Base.h>
42 #include <shark/Core/Math.h>
43 namespace shark {
44 
45 
46 ///
47 /// \brief KD-tree, a binary space-partitioning tree
48 ///
49 /// \par
50 /// KD-tree data structure for efficient nearest
51 /// neighbor searches in low-dimensional spaces.
52 /// Low-dimensional means << 10 dimensions.
53 ///
54 /// \par
55 /// The tree is constructed from data by splitting
56 /// the dimension with largest extent (in the data
57 /// covered, not space represented by the cell),
58 /// recursively. An approximate median is used as
59 /// the cutting point, where the maximal number of
60 /// points used to estimate the median can be
61 /// controlled with the template parameter
62 /// MedianAccuracy.
63 ///
64 /// \par
65 /// The bucket size for the tree is aleays one,
66 /// such that there is a unique point in each leaf
67 /// cell.
68 ///
69 template <class InputT>
70 class KDTree : public BinaryTree<InputT>
71 {
72  typedef KDTree<InputT> self_type;
74 public:
75 
76  /// Construct the tree from data.
77  /// It is assumed that the container exceeds
78  /// the lifetime of the KDTree (which holds
79  /// only references to the points), and that
80  /// the memory locations of the points remain
81  /// unchanged.
83  : base_type(dataset.numberOfElements())
84  , m_cutDim(0xffffffff){
85  typedef DataView<Data<RealVector> const> PointSet;
86  PointSet points(dataset);
87  //create a list to the iterator elements as temporary storage
88  std::vector<typename boost::range_iterator<PointSet>::type> elements(m_size);
89  boost::iota(elements,boost::begin(points));
90 
91  buildTree(tc,elements);
92 
93  //after the creation of the trees, the iterators in the array are sorted in the order,
94  //they are referenced by the nodes.so we can create the indexList using the indizes of the iterators
95  for(std::size_t i = 0; i != m_size; ++i){
96  mp_indexList[i] = elements[i].index();
97  }
98  }
99 
100 
101  /// lower bound on the box-shaped
102  /// space represented by this cell
103  double lower(std::size_t dim) const{
104  self_type* parent = static_cast<self_type*>(mep_parent);
105  if (parent == NULL) return -1e100;
106 
107  if (parent->m_cutDim == dim && parent->mp_right == this)
108  return parent->threshold();
109  else
110  return parent->lower(dim);
111  }
112 
113  /// upper bound on the box-shaped
114  /// space represented by this cell
115  double upper(std::size_t dim) const{
116  self_type* parent = static_cast<self_type*>(mep_parent);
117  if (parent == NULL) return +1e100;
118 
119  if (parent->m_cutDim == dim && parent->mp_left == this)
120  return parent->threshold();
121  else
122  return parent->upper(dim);
123  }
124 
125  /// \par
126  /// Compute the squared Euclidean distance of
127  /// this cell to the given reference point, or
128  /// alternatively a lower bound on this value.
129  ///
130  /// \par
131  /// In the case of the kd-tree the computation
132  /// is exact, however, only a lower bound is
133  /// required in general, and other space
134  /// partitioning trees used in the future may
135  /// only be able to provide a lower bound, at
136  /// least with reasonable computational efforts.
137  double squaredDistanceLowerBound(InputT const& reference) const
138  {
139  double ret = 0.0;
140  for (std::size_t d = 0; d != reference.size(); d++)
141  {
142  double v = reference(d);
143  double l = lower(d);
144  double u = upper(d);
145  if (v < l){
146  ret += sqr(l-v);
147  }
148  else if (v > u){
149  ret += sqr(v-u);
150  }
151  }
152  return ret;
153  }
154 
155 #if 0
156  // debug code, please ignore
157  void print(unsigned int ident = 0) const
158  {
159  if (this->isLeaf())
160  {
161  for (unsigned int j=0; j<m_size; j++)
162  {
163  for (unsigned int i=0; i<ident; i++) printf(" ");
164  printf("index: %d\n", (int)this->index(j));
165  }
166  }
167  else
168  {
169  for (unsigned int i=0; i<ident; i++) printf(" ");
170  printf("x[%d] < %g\n", (int)m_cutDim, this->threshold());
171  for (unsigned int i=0; i<ident; i++) printf(" ");
172  printf("[%d]\n", (int)mp_left->size());
173  ((self_type*)mp_left)->print(ident + 1);
174  for (unsigned int i=0; i<ident; i++) printf(" ");
175  printf("[%d]\n", (int)mp_right->size());
176  ((self_type*)mp_right)->print(ident + 1);
177  }
178  }
179 #endif
180 
181 protected:
182  using base_type::mep_parent;
183  using base_type::mp_left;
184  using base_type::mp_right;
186  using base_type::m_size;
187  using base_type::m_nodes;
188 
189  /// (internal) construction of a non-root node
190  KDTree(KDTree* parent, std::size_t* list, std::size_t size)
191  : base_type(parent, list, size)
192  , m_cutDim(0xffffffff)
193  { }
194 
195  /// (internal) construction method:
196  /// median-cuts of the dimension with widest spread
197  template<class Range>
198  void buildTree(TreeConstruction tc, Range& points){
199  typedef typename boost::range_iterator<Range>::type iterator;
200 
201  iterator begin = boost::begin(points);
202  iterator end = boost::end(points);
203 
204  if (tc.maxDepth() == 0 || m_size <= tc.maxBucketSize()){
205  m_nodes = 1;
206  return;
207  }
208 
210  if (m_cutDim == (*begin)->size()){
211  m_nodes = 1;
212  return;
213  }
214 
215  // calculate the distance of the boundary for every point in the list
216  std::vector<double> distance(m_size);
217  iterator point = begin;
218  for(std::size_t i = 0; i != m_size; ++i,++point){
219  distance[i] = (**point)[m_cutDim];
220  }
221 
222  // split the list into sub-cells
223  iterator split = this->splitList(distance,points);
224  std::size_t leftSize = split-begin;
225 
226  // create sub-nodes
227  mp_left = new KDTree(this, mp_indexList, leftSize);
228  mp_right = new KDTree(this, mp_indexList + leftSize, m_size - leftSize);
229 
230  // recurse
231  boost::iterator_range<iterator> left(begin,split);
232  boost::iterator_range<iterator> right(split,end);
233  ((KDTree*)mp_left)->buildTree(tc.nextDepthLevel(), left);
234  ((KDTree*)mp_right)->buildTree(tc.nextDepthLevel(), right);
235  m_nodes = 1 + mp_left->nodes() + mp_right->nodes();
236  }
237 
238  ///\brief Calculate the dimension which should be cut by this node
239  ///
240  ///The KD-Tree calculates the Axis-Aligned-Bounding-Box surrounding the points
241  ///and splits the longest dimension
242  template<class Range>
243  std::size_t calculateCuttingDimension(Range const& points)const{
244  typedef typename boost::range_iterator<Range const>::type iterator;
245 
246  iterator begin = boost::begin(points);
247  // iterator end = boost::end(points);
248 
249  // calculate bounding box of the data
250  InputT L = **begin;
251  InputT U = **begin;
252  std::size_t dim = L.size();
253  iterator point = begin;
254  ++point;
255  for (std::size_t i=1; i != m_size; ++i,++point){
256  for (std::size_t d = 0; d != dim; d++){
257  double v = (**point)[d];
258  if (v < L[d]) L[d] = v;
259  if (v > U[d]) U[d] = v;
260  }
261  }
262 
263  // find the longest edge of the bounding box
264  std::size_t cutDim = 0;
265  double extent = U[0] - L[0];
266  for (std::size_t d = 1; d != dim; d++)
267  {
268  double e = U[d] - L[d];
269  if (e > extent)
270  {
271  extent = e;
272  cutDim = d;
273  }
274  }
275  if(extent == 0)
276  return dim;
277  return cutDim;
278  }
279 
280  /// Function describing the separating hyperplane
281  /// as its zero set. It is guaranteed that the
282  /// gradient has norm one, thus the absolute value
283  /// of the function indicates distance from the
284  /// hyperplane.
285  double funct(InputT const& reference) const{
286  return reference[m_cutDim];
287  }
288 
289  /// split/cut dimension of this node
290  std::size_t m_cutDim;
291 };
292 
293 
294 }
295 #endif