LCTree.h
Go to the documentation of this file.
1 //===========================================================================
2 /*!
3  *
4  *
5  * \brief Tree for nearest neighbor search in data with low embedding dimension.
6  *
7  *
8  *
9  * \author T. Glasmachers
10  * \date 2011
11  *
12  *
13  * \par Copyright 1995-2017 Shark Development Team
14  *
15  * <BR><HR>
16  * This file is part of Shark.
17  * <http://shark-ml.org/>
18  *
19  * Shark is free software: you can redistribute it and/or modify
20  * it under the terms of the GNU Lesser General Public License as published
21  * by the Free Software Foundation, either version 3 of the License, or
22  * (at your option) any later version.
23  *
24  * Shark is distributed in the hope that it will be useful,
25  * but WITHOUT ANY WARRANTY; without even the implied warranty of
26  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27  * GNU Lesser General Public License for more details.
28  *
29  * You should have received a copy of the GNU Lesser General Public License
30  * along with Shark. If not, see <http://www.gnu.org/licenses/>.
31  *
32  */
33 //===========================================================================
34 
35 #ifndef SHARK_ALGORITHMS_NEARESTNEIGHBORS_LCTREE_H
36 #define SHARK_ALGORITHMS_NEARESTNEIGHBORS_LCTREE_H
37 
38 
40 #include <shark/Data/DataView.h>
41 #include <shark/LinAlg/Base.h>
42 #include <boost/array.hpp>
43 
44 namespace shark {
45 
46 
47 ///
48 /// \brief LC-tree, a binary space-partitioning tree
49 ///
50 /// \par
51 /// LC-tree data structure for efficient nearest
52 /// neighbor searches in possibly high-dimensional
53 /// spaces, but with low-dimensional effective data
54 /// dimension (means << 10 dimensions).
55 /// "LC" stands for Linear Cut.
56 ///
57 /// This tree requires the existence of a function
58 /// inner_prod computing the standard inner product
59 /// of two objects of type VectorType, and a function
60 /// distanceSqr computing the squared Euclidean
61 /// distance between two vectors.
62 ///
63 /// \par
64 /// The tree is constructed from data by splitting
65 /// the direction with largest extent (in the data
66 /// covered, not space represented by the cell),
67 /// recursively. Approximate direction and median
68 /// are used to determine the cutting hyperplane,
69 /// where the maximal number of points used to
70 /// estimate the median can be controlled with the
71 /// template parameter CuttingAccuracy.
72 ///
73 /// \par
74 /// The bucket size for the tree is always one,
75 /// such that there is a unique point in each leaf
76 /// cell.
77 ///
78 template <class VectorType = RealVector, int CuttingAccuracy = 25>
79 class LCTree : public BinaryTree<VectorType>
80 {
82 public:
83  /// Construct the tree from data.
84  /// It is assumed that the container exceeds
85  /// the lifetime of the LCTree (which holds
86  /// only references to the points), and that
87  /// the memory locations of the points remain
88  /// unchanged.
90  : base_type(dataset.numberOfElements())
91  , m_normal(dataDimension(dataset)){
92  typedef DataView<Data<RealVector> const> PointSet;
93  PointSet points(dataset);
94  //create a list to the iterator elements as temporary storage
95  //we need indexed operators to have a fast lookup of the position of the elements in the container
97  std::vector<iterator> elements(m_size);
98  boost::iota(elements,iterator(boost::begin(points),0));
99 
100  buildTree(tc,elements);
101  //after the creation of the trees, the iterators in the array are sorted in the order,
102  //they are referenced by the nodes.so we can create the indexList using the indizes of the iterators
103  for(std::size_t i = 0; i != m_size; ++i){
104  mp_indexList[i] = elements[i].index();
105  }
106  }
107 
108 
109  /// \par
110  /// Compute the squared Euclidean distance of
111  /// this cell to the given reference point, or
112  /// alternatively a lower bound on this value.
113  double squaredDistanceLowerBound(VectorType const& reference) const{
114  double dist = 0.0;
115  LCTree const* t = this;
116  LCTree const* p = (LCTree const*)mep_parent;
117  while (p != NULL)
118  {
119  double v = p->distanceFromPlane(reference);
120  if (t == p->mp_right)
121  v = -v;
122  if (v > dist)
123  dist = v;
124  t = p;
125  p = (LCTree const*)p->mep_parent;
126  }
127  return dist * dist;
128  }
129 
130 protected:
131  using base_type::mep_parent;
132  using base_type::mp_left;
133  using base_type::mp_right;
135  using base_type::m_size;
136  using base_type::m_nodes;
137 
138  /// (internal) construction of a non-root node
139  LCTree(LCTree* parent, std::size_t* list, std::size_t size)
140  : base_type(parent, list, size){}
141 
142  /// (internal) construction method:
143  /// median-cuts of the dimension with widest spread
144  template<class Range>
145  void buildTree(TreeConstruction tc, Range& points){
146  typedef typename Range::value_type pointIterator;
147  typedef typename Range::iterator iterator;
148 
149  //check whether we are finished
150  if (tc.maxDepth() == 0 || m_size <= tc.maxBucketSize()) {
151  m_nodes = 1;
152  return;
153  }
154 
155  // use only a subset of size at most CuttingAccuracy
156  // to estimate the vector along the longest
157  // distance
158  if (m_size <= CuttingAccuracy){
159  calculateNormal(points);
160  }
161  else{
162  boost::array<pointIterator,CuttingAccuracy> samples;
163  for(std::size_t i = 0; i != CuttingAccuracy; i++)
164  samples[i] = points[m_size * (2*i+1) / (2*CuttingAccuracy)];
165  calculateNormal(samples);
166  }
167 
168  //calculate the distance from the plane for every point in the list
169  std::vector<double> distance(m_size);
170  for(std::size_t i = 0; i != m_size; ++i){
171  distance[i] = inner_prod(m_normal, *points[i]);
172  }
173 
174 
175  // split the list into sub-cells
176  iterator split = this->splitList(distance,points);
177  iterator begin = boost::begin(points);
178  iterator end = boost::end(points);
179 
180  if (split == end) {//can't split points.
181  m_nodes = 1;
182  return;
183  }
184 
185  // create sub-nodes
186  std::size_t leftSize = split-begin;
187  mp_left = new LCTree(this, mp_indexList, leftSize);
188  mp_right = new LCTree(this, mp_indexList + leftSize, m_size - leftSize);
189 
190  // recurse
191  boost::iterator_range<iterator> left(begin,split);
192  boost::iterator_range<iterator> right(split,end);
193  ((LCTree*)mp_left)->buildTree(tc.nextDepthLevel(),left);
194  ((LCTree*)mp_right)->buildTree(tc.nextDepthLevel(),right);
195  m_nodes = 1 + mp_left->nodes() + mp_right->nodes();
196  }
197 
198  /// function describing the separating hyperplane
199  double funct(VectorType const& reference) const{
200  return inner_prod(m_normal, reference);
201  }
202 
203  //find the longest distance between vectors in the sample set and calculate
204  //the normal along this direction
205  template<class Range>
206  void calculateNormal(Range const& samples){
207  std::size_t numSamples = samples.size();
208  std::size_t besti = 0;
209  std::size_t bestj = 0;
210  double best_dist2 = -1.0;
211  for (std::size_t i = 1; i != numSamples; i++){
212  for (std::size_t j = 0; j != i; j++){
213  double dist2 = distanceSqr(*samples[i], *samples[j]);
214  if (dist2 > best_dist2){
215  best_dist2 = dist2;
216  besti = i;
217  bestj = j;
218  }
219  }
220  }
221  double factor = 1.0 / std::sqrt(best_dist2);
222  if (! (boost::math::isfinite)(factor))
223  factor = 1.0;
224 
225  m_normal = factor * (*samples[besti] - *samples[bestj]);
226  }
227 
228  /// split/cut normal vector of this node
230 };
231 
232 
233 }
234 #endif