KMeans.h
//===========================================================================
/*!
 *
 *
 * \brief The k-means clustering algorithm.
 *
 *
 *
 * \author T. Glasmachers
 * \date 2011
 *
 *
 * \par Copyright 1995-2017 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <http://shark-ml.org/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================


#ifndef SHARK_ALGORITHMS_KMEANS_H
#define SHARK_ALGORITHMS_KMEANS_H

#include <shark/Core/DLLSupport.h>
#include <shark/Data/Dataset.h>
#include <shark/Models/Clustering/Centroids.h>
#include <shark/Models/RBFLayer.h>
// headers for KernelExpansion and calculateRegularizedKernelMatrix used below
#include <shark/Models/Kernels/KernelExpansion.h>
#include <shark/Models/Kernels/KernelHelpers.h>
#include <boost/range/algorithm/equal.hpp>
#include <algorithm>
#include <limits>


namespace shark{


///
/// \brief The k-means clustering algorithm.
///
/// \par
/// The k-means algorithm takes vector-valued data
/// \f$ \{x_1, \dots, x_n\} \subset \mathbb R^d \f$
/// and splits it into k clusters, based on centroids
/// \f$ \{c_1, \dots, c_k\} \f$.
/// The result is stored in a Centroids object that can be used to
/// construct clustering models.
///
/// \par
/// If the provided centroids object (third parameter) already contains
/// a set of k centroids, the search starts from these; otherwise it
/// starts from the first k data points.
///
/// \par
/// Note that the data set needs to contain at least k data points
/// for k-means to work, because the current implementation does not
/// allow for empty clusters.
///
/// \param data vector-valued data to be clustered
/// \param k number of clusters
/// \param centroids centroids input/output
/// \param maxIterations maximum number of k-means iterations; 0: unlimited
/// \return number of k-means iterations performed
///
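/// \par Example
/// A minimal usage sketch, assuming a Data<RealVector> named data has already
/// been loaded and that the resulting centroids are wrapped in a
/// HardClusteringModel (from <shark/Models/Clustering/HardClusteringModel.h>)
/// to obtain cluster assignments:
/// \code
/// Centroids centroids;
/// std::size_t iterations = kMeans(data, 3, centroids);   // cluster into k = 3 groups
/// HardClusteringModel<RealVector> model(&centroids);     // hard assignment to the nearest centroid
/// Data<unsigned int> assignments = model(data);          // cluster index of every point
/// \endcode
///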
SHARK_EXPORT_SYMBOL std::size_t kMeans(Data<RealVector> const& data, std::size_t k, Centroids& centroids, std::size_t maxIterations = 0);

///
/// \brief The k-means clustering algorithm for initializing an RBFLayer.
///
/// \par
/// The k-means algorithm takes vector-valued data
/// \f$ \{x_1, \dots, x_n\} \subset \mathbb R^d \f$
/// and splits it into k clusters, based on centroids
/// \f$ \{c_1, \dots, c_k\} \f$.
/// The result is stored in an RBFLayer object that can be used to
/// construct clustering models.
///
/// \par
/// This is an alternative frontend to the version using Centroids: it creates
/// a Centroids object with as many clusters as the RBFLayer has outputs,
/// runs k-means, and copies the result into the model.
///
/// \par
/// Note that the data set needs to contain at least k data points
/// for k-means to work, because the current implementation does not
/// allow for empty clusters.
///
/// \param data vector-valued data to be clustered
/// \param model RBFLayer input/output
/// \param maxIterations maximum number of k-means iterations; 0: unlimited
/// \return number of k-means iterations performed
///
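/// \par Example
/// A minimal sketch, assuming a Data<RealVector> named data, an RBFLayer
/// constructed from (number of inputs, number of RBF units), and the
/// dataDimension helper for the input dimension; the number of RBF units
/// determines the number of clusters k:
/// \code
/// RBFLayer layer(dataDimension(data), 10);        // 10 RBF units, i.e. k = 10 clusters
/// std::size_t iterations = kMeans(data, layer);   // initializes the layer's centers
/// \endcode
///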
SHARK_EXPORT_SYMBOL std::size_t kMeans(Data<RealVector> const& data, RBFLayer& model, std::size_t maxIterations = 0);

///
/// \brief The kernel k-means clustering algorithm
///
/// \par
/// The kernel k-means algorithm takes data
/// \f$ \{x_1, \dots, x_n\} \f$
/// and splits it into k clusters, based on centroids
/// \f$ \{c_1, \dots, c_k\} \f$.
/// The centroids are elements of the reproducing kernel Hilbert space
/// (RKHS) induced by the kernel function. They are functions, represented
/// as the components of a KernelExpansion object. I.e., given a data point
/// x, the kernel expansion returns a k-dimensional vector f(x), which is
/// the evaluation of the centroid functions on x. The value of the
/// centroid function represents the inner product of the centroid with
/// the kernel-induced feature vector of x (embedding of x into the RKHS).
/// The distance of x from the centroid \f$ c_i \f$ is computed as the
/// kernel-induced distance
/// \f$ \sqrt{ kernel(x, x) + kernel(c_i, c_i) - 2 kernel(x, c_i) } \f$.
/// For the Gaussian kernel (and other normalized kernels) this simplifies to
/// \f$ \sqrt{ 2 - 2 kernel(x, c_i) } \f$. Hence, larger function values
/// indicate smaller distance to the centroid.
///
/// \par
/// Note that the data set needs to contain at least k data points
/// for k-means to work, because the current implementation does not
/// allow for empty clusters.
///
/// \param dataset data to be clustered
/// \param k number of clusters
/// \param kernel kernel function object
/// \param maxIterations maximum number of k-means iterations; 0: unlimited
/// \return centroids, represented as functions (see the description above)
///
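/// \par Example
/// A minimal sketch, assuming a Data<RealVector> named data and a
/// GaussianRbfKernel (from <shark/Models/Kernels/GaussianRbfKernel.h>); the
/// component of f(x) with the largest value identifies the nearest centroid:
/// \code
/// GaussianRbfKernel<RealVector> kernel(0.5);                 // bandwidth parameter gamma = 0.5
/// KernelExpansion<RealVector> f = kMeans(data, 3, kernel);   // k = 3 centroid functions
/// Data<RealVector> scores = f(data);                         // k scores per data point
/// // the cluster of each point is the index of its largest score (arg_max per row)
/// \endcode
///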
template<class InputType>
KernelExpansion<InputType> kMeans(Data<InputType> const& dataset, std::size_t k, AbstractKernelFunction<InputType>& kernel, std::size_t maxIterations = 0){
    if(!maxIterations)
        maxIterations = std::numeric_limits<std::size_t>::max();

    std::size_t n = dataset.numberOfElements();
    RealMatrix kernelMatrix = calculateRegularizedKernelMatrix(kernel,dataset,0);
    UIntVector clusterMembership(n);
    UIntVector clusterSizes(k,0);
    RealVector ckck(k,0);

    //init cluster assignments
    for(unsigned int i = 0; i != n; ++i){
        clusterMembership(i) = i % k;
    }
    std::shuffle(clusterMembership.begin(),clusterMembership.end(),random::globalRng);
    for(std::size_t i = 0; i != n; ++i){
        ++clusterSizes(clusterMembership(i));
    }

    // k-means loop
    std::size_t iter = 0;
    bool equal = false;
    for(; iter != maxIterations && !equal; ++iter) {
        //the clustering model results in centers c_k = 1/n_k sum_i k(x_i,.) over all points x_i of cluster k.
        //we need to compute the squared distances between all centers and points, that is
        //d^2(c_k,x_i) = <c_k,c_k> - 2 <c_k,x_i> + <x_i,x_i> for the i-th point.
        //thus we precompute <c_k,c_k> = sum_ij k(x_i,x_j)/(n_k)^2 over all points x_i,x_j of cluster k
        ckck.clear();
        for(std::size_t i = 0; i != n; ++i){
            std::size_t c1 = clusterMembership(i);
            for(std::size_t j = 0; j != n; ++j){
                std::size_t c2 = clusterMembership(j);
                if(c1 != c2) continue;
                ckck(c1) += kernelMatrix(i,j);
            }
        }
        noalias(ckck) = safe_div(ckck,sqr(clusterSizes),0);

        UIntVector newClusterMembership(n);
        RealVector currentDistances(k);
        for(std::size_t i = 0; i != n; ++i){
            //compute squared distances between the i-th point and the centers.
            //we skip <x_i,x_i> as it is the same for all centers and irrelevant for the comparison
            noalias(currentDistances) = ckck;
            for(std::size_t j = 0; j != n; ++j){
                std::size_t c = clusterMembership(j);
                currentDistances(c) -= 2 * kernelMatrix(i,j)/clusterSizes(c);
            }
            //choose the index with the smallest distance as new cluster
            newClusterMembership(i) = (unsigned int) arg_min(currentDistances);
        }
        equal = boost::equal(
            newClusterMembership,
            clusterMembership
        );
        noalias(clusterMembership) = newClusterMembership;
        //compute new sizes of clusters
        clusterSizes.clear();
        for(std::size_t i = 0; i != n; ++i){
            ++clusterSizes(clusterMembership(i));
        }

        //if a cluster is empty, assign a random point to it
        for(unsigned int i = 0; i != k; ++i){
            if(clusterSizes(i) == 0){
                std::size_t elem = random::uni(random::globalRng, std::size_t(0), n-1);
                --clusterSizes(clusterMembership(elem));
                clusterMembership(elem) = i;
                clusterSizes(i) = 1;
            }
        }
    }

    //copy the result into a kernel expansion: with alpha(i,c) = 2/n_c for the
    //points of cluster c and offset -<c_c,c_c>, the c-th output evaluates to
    //f_c(x) = 2<c_c,phi(x)> - <c_c,c_c> = k(x,x) - d^2(x,c_c),
    //so the nearest centroid is the component with the largest value
    KernelExpansion<InputType> expansion;
    expansion.setStructure(&kernel,dataset,true,k);
    expansion.offset() = -ckck;
    expansion.alpha().clear();
    for(std::size_t i = 0; i != n; ++i){
        std::size_t c = clusterMembership(i);
        expansion.alpha()(i,c) = 2.0 / clusterSizes(c);
    }

    return expansion;
}

} // namespace shark
#endif