ExportKernelMatrix.h
Go to the documentation of this file.
1 //===========================================================================
2 /*!
3  *
4  *
5  * \brief export precomputed kernel matrices (using libsvm format)
6  *
7  *
8  *
9  * \author M. Tuma
10  * \date 2012
11  *
12  *
13  * \par Copyright 1995-2017 Shark Development Team
14  *
15  * <BR><HR>
16  * This file is part of Shark.
17  * <http://shark-ml.org/>
18  *
19  * Shark is free software: you can redistribute it and/or modify
20  * it under the terms of the GNU Lesser General Public License as published
21  * by the Free Software Foundation, either version 3 of the License, or
22  * (at your option) any later version.
23  *
24  * Shark is distributed in the hope that it will be useful,
25  * but WITHOUT ANY WARRANTY; without even the implied warranty of
26  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27  * GNU Lesser General Public License for more details.
28  *
29  * You should have received a copy of the GNU Lesser General Public License
30  * along with Shark. If not, see <http://www.gnu.org/licenses/>.
31  *
32  */
33 //===========================================================================
34 
35 #ifndef SHARK_DATA_PRECOMPUTEDMATRIX_H
36 #define SHARK_DATA_PRECOMPUTEDMATRIX_H
37 
38 
39 
40 #include <fstream>
41 #include <shark/Data/Dataset.h>
42 #include <shark/Data/DataView.h>
46 
47 
48 namespace shark
49 {
50 
51 /**
52  * \ingroup shark_globals
53  *
54  * @{
55  */
56 
/// \brief Normalization schemes applicable to an exported kernel Gram matrix.
enum KernelMatrixNormalizationType
{
	NONE,                                // no normalization: output the regular Gram kernel matrix
	MULTIPLICATIVE_TRACE_ONE,            // determine the trace, and divide each entry by it
	MULTIPLICATIVE_TRACE_N,              // determine the trace, divide each entry by it, then multiply by the number of samples
	MULTIPLICATIVE_VARIANCE_ONE,         // normalize to unit variance in feature space. see Kloft in JMLR 2012.
	CENTER_ONLY,                         // center the kernel in feature space. see Cortes in JMLR 2012 and in ICML 2010.
	CENTER_AND_MULTIPLICATIVE_TRACE_ONE  // first center the kernel in feature space, then divide each entry by the centered kernel's trace.
};
66 
67 /// \brief Write a kernel Gram matrix to stream.
68 ///
69 /// \param dataset data basis for the Gram matrix
70 /// \param kernel pointer to kernel function to be used
71 /// \param out The stream to be written to
72 /// \param normalizer what kind of normalization to apply. see enum declaration for details.
73 /// \param scientific should the output be in scientific notation?
74 /// \param fieldwidth field width for pretty printing
75 template<typename InputType, typename LabelType>
77  LabeledData<InputType, LabelType> const &dataset,
78  AbstractKernelFunction<InputType> &kernel, // kernel function (can't be const b/c of ScaledKernel later)
79  std::ostream &out, // The stream to be written to
80  KernelMatrixNormalizationType normalizer = NONE, // what kind of normalization to apply. see enum declaration for details.
81  bool scientific = false, // scientific notation?
82  unsigned int fieldwidth = 0 // for pretty-printing
83 )
84 {
85  //get access to the range of elements
86  DataView<LabeledData<InputType, LabelType> const> points(dataset);
87  std::size_t size = points.size();
88 
89  SIZE_CHECK(size != 0);
90  // check outstream status
91  if(!out)
92  {
93  throw(std::invalid_argument("[export_kernel_matrix] Can't write to stream."));
94  }
95 
96  // COMPUTE MODIFIERS
97 
98  // if multiplicative trace normalization: determine trace
99  double trace = 0.0;
100  double trace_factor = 1.0;
101  if(normalizer == MULTIPLICATIVE_TRACE_ONE || normalizer == MULTIPLICATIVE_TRACE_N)
102  {
103  for(auto point: points)
104  {
105  trace += kernel.eval(point.input, point.input);
106  }
107  SHARK_ASSERT(trace > 0);
108  trace_factor = 1.0 / trace;
109  if(normalizer == MULTIPLICATIVE_TRACE_N)
110  {
111  trace_factor *= size;
112  }
113  }
114 
115  // if multiplicative variance normalization: determine factor
116  double variance_factor = 0.0;
117  if(normalizer == MULTIPLICATIVE_VARIANCE_ONE)
118  {
119  ScaledKernel<InputType> scaled(&kernel);
121  normalizer.train(scaled, dataset.inputs());
122  variance_factor = scaled.factor();
123  }
124 
125  // if centering: determine matrix- and row-wise means;
126  double mean = 0;
127  RealVector rowmeans(size, 0.0);
128  if(normalizer == CENTER_ONLY || normalizer == CENTER_AND_MULTIPLICATIVE_TRACE_ONE)
129  {
130  // initialization: calculate mean and rowmeans
131  for(std::size_t i = 0; i < size; i++)
132  {
133  double k = kernel.eval(points[i].input, points[i].input);
134  mean += k; //add diagonal value to mean once
135  rowmeans(i) += k; //add diagonal to its rowmean
136  for(std::size_t j = 0; j < i; j++)
137  {
138  double k = kernel.eval(points[i].input, points[j].input);
139  mean += 2.0 * k; //add off-diagonals to mean twice
140  rowmeans(i) += k; //add to mean of row
141  rowmeans(j) += k; //add to mean of transposed row
142  }
143  }
144  mean = mean / (double) size / (double) size;
145  rowmeans /= size;
146  // get trace if necessary
147  if(normalizer == CENTER_AND_MULTIPLICATIVE_TRACE_ONE)
148  {
149  trace = 0.0;
150  for(std::size_t i = 0; i < size; i++)
151  {
152  trace += kernel.eval(points[i].input, points[i].input) - 2 * rowmeans(i) + mean;
153  }
154  SHARK_ASSERT(trace > 0);
155  trace_factor = 1.0 / trace;
156  }
157  }
158 
159  // FIX OUTPUT FORMAT
160 
161  // set output format
162  if(scientific)
163  out.setf(std::ios_base::scientific);
164  std::streamsize ss = out.precision();
165  out.precision(10);
166 
167  // determine dataset type
168  double max_label = -1e100;
169  double min_label = -max_label;
170  bool binary = false;
171  bool regression = false;
172  for(double cur_label: dataset.labels().elements())
173  {
174  if(cur_label > max_label)
175  max_label = cur_label;
176  if(cur_label < min_label)
177  min_label = cur_label;
178  if((cur_label != (int)cur_label) || cur_label < 0)
179  regression = true;
180  }
181  if(!regression && (min_label == 0) && (max_label == 1))
182  binary = true;
183 
184  // WRITE OUT
185 
186  // write to file:
187  // loop through examples (rows)
188  for(std::size_t i = 0; i < size; i++)
189  {
190 
191  // write label
192  if(regression)
193  {
194  out << std::setw(fieldwidth) << std::left << points[i].label << " ";
195  }
196  else if(binary)
197  {
198  out << std::setw(fieldwidth) << std::left << (int)(points[i].label * 2 - 1) << " ";
199  }
200  else
201  {
202  out << std::setw(fieldwidth) << std::left << (unsigned int)(points[i].label + 1) << " ";
203  }
204 
205  out << "0:" << std::setw(fieldwidth) << std::left << i + 1; //write index
206 
207  // loop through examples (columns)
208  // CASE DISTINCTION:
209  if(normalizer == NONE)
210  {
211  for(std::size_t j = 0; j < size; j++)
212  {
213  out << " " << j + 1 << ":" << std::setw(fieldwidth) << std::left << kernel.eval(points[i].input, points[j].input);
214  }
215  out << "\n";
216  }
217  else if(normalizer == MULTIPLICATIVE_TRACE_ONE || normalizer == MULTIPLICATIVE_TRACE_N)
218  {
219  for(std::size_t j = 0; j < size; j++)
220  {
221  out << " " << j + 1 << ":" << std::setw(fieldwidth) << std::left << trace_factor * kernel.eval(points[i].input, points[j].input);
222  }
223  out << "\n";
224  }
225  else if(normalizer == MULTIPLICATIVE_VARIANCE_ONE)
226  {
227  for(std::size_t j = 0; j < size; j++)
228  {
229  out << " " << j + 1 << ":" << std::setw(fieldwidth) << std::left << variance_factor *kernel.eval(points[i].input, points[j].input);
230  }
231  out << "\n";
232  }
233  else if(normalizer == CENTER_ONLY)
234  {
235  for(std::size_t j = 0; j < size; j++)
236  {
237  double tmp = kernel.eval(points[i].input, points[j].input) - rowmeans(i) - rowmeans(j) + mean;
238  out << " " << j + 1 << ":" << std::setw(fieldwidth) << std::left << tmp;
239  }
240  out << "\n";
241  }
242  else if(normalizer == CENTER_AND_MULTIPLICATIVE_TRACE_ONE)
243  {
244  for(std::size_t j = 0; j < size; j++)
245  {
246  double tmp = kernel.eval(points[i].input, points[j].input) - rowmeans(i) - rowmeans(j) + mean;
247  out << " " << j + 1 << ":" << std::setw(fieldwidth) << std::left << trace_factor *tmp;
248  }
249  out << "\n";
250  }
251  else
252  {
253  throw SHARKEXCEPTION("[detail::export_kernel_matrix] Unknown normalization type.");
254  }
255 
256  }
257 
258  // clean up
259  out.precision(ss);
260 }
261 
262 
263 
264 /// \brief Write a kernel Gram matrix to file.
265 ///
266 /// \param dataset data basis for the Gram matrix
267 /// \param kernel pointer to kernel function to be used
268 /// \param fn The filename of the file to be written to
269 /// \param normalizer what kind of normalization to apply. see enum declaration for details.
270 /// \param sci should the output be in scientific notation?
271 /// \param width field width for pretty printing
272 template<typename InputType, typename LabelType>
274  LabeledData<InputType, LabelType> const &dataset,
276  std::string fn,
277  KernelMatrixNormalizationType normalizer = NONE,
278  bool sci = false,
279  unsigned int width = 0
280 )
281 {
282  std::ofstream ofs(fn.c_str());
283  if(ofs)
284  {
285  exportKernelMatrix(dataset, kernel, ofs, normalizer, sci, width);
286  }
287  else
288  throw(std::invalid_argument("[detail::export_kernel_matrix] Stream cannot be opened for writing."));
289 
290 }
291 
292 
293 
294 
295 // deprecated wrapper
296 template<typename InputType, typename LabelType>
298  LabeledData<InputType, LabelType> const &dataset,
299  AbstractKernelFunction<InputType> &kernel, // kernel function (can't be const b/c of ScaledKernel later)
300  std::ostream &out, // The stream to be written to
301  KernelMatrixNormalizationType normalizer = NONE, // what kind of normalization to apply. see enum declaration for details.
302  bool scientific = false, // scientific notation?
303  unsigned int fieldwidth = 0 // for pretty-printing
304 )
305 {
306  exportKernelMatrix(dataset, kernel, out, normalizer, scientific, fieldwidth);
307 }
308 
309 
310 // deprecated wrapper
311 template<typename InputType, typename LabelType>
313  LabeledData<InputType, LabelType> const &dataset,
315  std::string fn,
316  KernelMatrixNormalizationType normalizer = NONE,
317  bool sci = false,
318  unsigned int width = 0
319 )
320 {
321  exportKernelMatrix(dataset, kernel, fn, normalizer, sci, width);
322 }
323 
324 
325 
326 // TODO: import functionality is still missing.
327 // when that is done, add tutorial
328 
329 
330 /** @}*/
331 
332 } // namespace shark
333 
334 
335 
336 #endif // SHARK_DATA_PRECOMPUTEDMATRIX_H