OpenMS  2.7.0
SVMWrapper.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2021.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Timo Sachsenberg $
32 // $Authors: Nico Pfeifer, Chris Bielow $
33 // --------------------------------------------------------------------------
34 
35 #pragma once
36 
37 #include <svm.h>
38 
39 #include <OpenMS/CONCEPT/Types.h>
42 #include <OpenMS/FORMAT/TextFile.h>
43 #include <OpenMS/SYSTEM/File.h>
45 
46 #include <string>
47 #include <vector>
48 #include <map>
49 #include <cmath>
50 
51 namespace OpenMS
52 {
53 
// Data structure used in SVMWrapper.
// Holds a set of samples as sparse (Int, double) pair lists together with one
// double label per entry of 'labels'.
// NOTE(review): presumably sequences[i] belongs to labels[i] -- confirm against the implementation.
55  struct OPENMS_DLLAPI SVMData
56  {
 // One inner vector per sample; each element is an (Int, double) pair.
 // NOTE(review): the pair is presumably (feature index/position, value) -- TODO confirm.
57  std::vector<std::vector<std::pair<Int, double> > > sequences;
 // Labels stored as doubles.
58  std::vector<double> labels;
59 
61 
 // Constructs from sequences and labels. Both arguments are taken by
 // non-const reference, so callers must pass lvalues.
 // NOTE(review): whether the arguments are copied or swapped into the members is not visible here.
62  SVMData(std::vector<std::vector<std::pair<Int, double> > >& seqs, std::vector<double>& lbls);
63 
 // Equality comparison against another SVMData instance.
64  bool operator==(const SVMData& rhs) const;
65 
 // Stores this object under 'filename'; does not modify the object (const).
 // NOTE(review): the bool result presumably signals success -- confirm.
66  bool store(const String& filename) const;
67 
 // Loads this object from 'filename'.
 // NOTE(review): the bool result presumably signals success -- confirm.
68  bool load(const String& filename);
69 
70  };
71 
// Serves as a wrapper for the libsvm.
// Derives from ProgressLogger (the base class for all classes that want to report their progress).
// NOTE(review): this listing is an extracted copy with gaps in its line numbering. The page's
// symbol index describes further members that are not present in the listing itself
// (SVMWrapper(), setParameter(type, Int), getIntParameter, getDoubleParameter, getSVRProbability,
// initParameters_, border_length_, kernel_type_, training_data_, shuffler_).
80  class OPENMS_DLLAPI SVMWrapper :
81  public ProgressLogger
82  {
83 public:
84 
 // SVM_parameter_type: parameters for the svm to be set from outside.
 // NOTE(review): the 'enum SVM_parameter_type' header line and the enumerators SVM_TYPE,
 // KERNEL_TYPE, DEGREE, GAMMA, PROBABILITY and SIGMA are missing from this extracted listing.
92  {
 // the C parameter of the svm
96  C,
 // the nu parameter for nu-SVR
97  NU,
 // the epsilon parameter for epsilon-SVR
98  P,
 // NOTE(review): no description available on this page; presumably the border length of the oligo kernel.
102  BORDER_LENGTH
103  };
104 
 // SVM_kernel_type: kernel type.
 // NOTE(review): the 'enum SVM_kernel_type' header line is missing from this extracted listing.
107  {
 // NOTE(review): starts at 19, presumably to stay clear of libsvm's own kernel ids -- TODO confirm.
108  OLIGO = 19,
 // Implicitly OLIGO + 1.
109  OLIGO_COMBINED
110  };
111 
114 
 // destructor
116  virtual ~SVMWrapper();
117 
160 
 // sets the double parameters of the svm
167  void setParameter(SVM_parameter_type type, double value);
168 
 // trains the svm (libsvm-encoded problem)
174  Int train(struct svm_problem* problem);
175 
 // trains the svm (SVMData-encoded problem)
181  Int train(SVMData& problem);
182 
 // saves the svm model
193  void saveModel(std::string modelFilename) const;
194 
 // loads the model
203  void loadModel(std::string modelFilename);
204 
 // predicts the labels using the trained model; results go to 'predicted_labels'
210  void predict(struct svm_problem* problem, std::vector<double>& predicted_labels);
211 
 // predicts the labels using the trained model; results go to 'results'
217  void predict(const SVMData& problem, std::vector<double>& results);
218 
259 
288 
 // You can create 'number' equally sized random partitions (output in 'partitions').
294  void createRandomPartitions(svm_problem* problem, Size number, std::vector<svm_problem*>& partitions);
295 
 // You can create 'number' equally sized random partitions (output in 'problems').
301  void createRandomPartitions(const SVMData& problem,
302  Size number,
303  std::vector<SVMData>& problems);
 // You can merge partitions excluding the partition with index 'except'.
 // NOTE(review): returns a raw svm_problem pointer; ownership is not visible here -- confirm who frees it.
307  static svm_problem* mergePartitions(const std::vector<svm_problem*>& problems, Size except);
308 
 // You can merge partitions excluding the partition with index 'except'; the result is written to 'merged_problem'.
312  static void mergePartitions(const std::vector<SVMData>& problems,
313  Size except,
314  SVMData& merged_problem);
315 
 // predicts the labels using the trained model; results go to 'predicted_rts'
322  void predict(const std::vector<svm_node*>& vectors, std::vector<double>& predicted_rts);
323 
 // Stores the labels of the encoded SVM data in 'labels'.
328  static void getLabels(svm_problem* problem, std::vector<double>& labels);
329 
 // Performs a CV (cross-validation) for the given data.
 // NOTE(review): 'is_labeled' presumably selects between 'problem_l' and 'problem_ul' -- confirm.
 // The three maps give start value, step size and end value per parameter; the best
 // combination found is written to 'best_parameters'.
 // NOTE(review): the meaning of the returned double is not visible here -- confirm.
334  double performCrossValidation(svm_problem* problem_ul,
335  const SVMData& problem_l,
336  const bool is_labeled,
337  const std::map<SVM_parameter_type, double>& start_values_map,
338  const std::map<SVM_parameter_type, double>& step_sizes_map,
339  const std::map<SVM_parameter_type, double>& end_values_map,
340  Size number_of_partitions,
341  Size number_of_runs,
342  std::map<SVM_parameter_type, double>& best_parameters,
343  bool additive_step_sizes = true,
344  bool output = false,
345  String performances_file_name = "performances.txt",
346  bool mcc_as_performance_measure = false);
347 
348 
359 
 // returns the value of the oligo kernel for sequences 'x' and 'y'
375  static double kernelOligo(const std::vector<std::pair<int, double> >& x,
376  const std::vector<std::pair<int, double> >& y,
377  const std::vector<double>& gauss_table,
378  int max_distance = -1);
379 
 // calculates the oligo kernel value for the encoded sequences 'x' and 'y'
387  static double kernelOligo(const svm_node* x, const svm_node* y, const std::vector<double>& gauss_table, double sigma_square = 0, Size max_distance = 50);
388 
 // calculates the significance borders of the error model and stores them in 'borders'
392  void getSignificanceBorders(svm_problem* data, std::pair<double, double>& borders, double confidence = 0.95, Size number_of_runs = 5, Size number_of_partitions = 5, double step_size = 0.01, Size max_iterations = 1000000);
393 
 // calculates the significance borders of the error model and stores them in 'sigmas'
397  void getSignificanceBorders(const SVMData& data,
398  std::pair<double, double>& sigmas,
399  double confidence = 0.95,
400  Size number_of_runs = 5,
401  Size number_of_partitions = 5,
402  double step_size = 0.01,
403  Size max_iterations = 1000000);
404 
 // calculates a p-value for a given data point using the model parameters
411  double getPValue(double sigma1, double sigma2, std::pair<double, double> point);
412 
 // stores the prediction values for the encoded data in 'decision_values'
422  void getDecisionValues(svm_problem* data, std::vector<double>& decision_values);
423 
 // Scales the data such that every column is scaled to [-1, 1].
 // NOTE(review): the effect of 'max_scale_value' (default -1) is not visible here -- confirm.
430  void scaleData(svm_problem* data, Int max_scale_value = -1);
431 
 // NOTE(review): no description available on this page; presumably fills 'gauss_table'
 // from 'border_length' and 'sigma' for the oligo kernel -- confirm.
432  static void calculateGaussTable(Size border_length, double sigma, std::vector<double>& gauss_table);
433 
 // computes the kernel matrix using the actual svm parameters and the given data
 // NOTE(review): returns a raw svm_problem pointer; ownership is not visible here.
441  svm_problem* computeKernelMatrix(svm_problem* problem1, svm_problem* problem2);
442 
 // computes the kernel matrix using the actual svm parameters and the given data
 // NOTE(review): returns a raw svm_problem pointer; ownership is not visible here.
450  svm_problem* computeKernelMatrix(const SVMData& problem1, const SVMData& problem2);
451 
 // This is used for being able to perform predictions with non libsvm standard kernels.
456  void setTrainingSample(svm_problem* training_sample);
457 
 // This is used for being able to perform predictions with non libsvm standard kernels.
461  void setTrainingSample(SVMData& training_sample);
462 
 // This function fills 'probabilities' with the probability estimates for the first class.
472  void getSVCProbabilities(struct svm_problem* problem, std::vector<double>& probabilities, std::vector<double>& prediction_labels);
473 
 // Sets weights for the classes in C_SVC (see libsvm documentation for further details)
477  void setWeights(const std::vector<Int>& weight_labels, const std::vector<double>& weights);
478 
479 private:
 // find next grid search parameter combination; 'actual_values' is updated in place
 // NOTE(review): the bool result presumably tells whether another combination exists -- confirm.
486  bool nextGrid_(const std::vector<double>& start_values,
487  const std::vector<double>& step_sizes,
488  const std::vector<double>& end_values,
489  const bool additive_step_sizes,
490  std::vector<double>& actual_values);
491 
 // NOTE(review): no description available on this page; presumably counts the 'points'
 // enclosed by the borders given through 'm1' and 'm2' -- confirm.
492  Size getNumberOfEnclosedPoints_(double m1, double m2, const std::vector<std::pair<double, double> >& points);
493 
498 
 // This function is passed to lib svm for output control.
504  static void printToVoid_(const char* /*s*/);
505 
 // the parameters for the svm
506  svm_parameter* param_;
 // the learned svm discriminant
507  svm_model* model_;
 // for the oligo kernel (amount of positional smearing)
508  double sigma_;
 // for the combined oligo kernel (amount of positional smearing)
509  std::vector<double> sigmas_;
 // lookup table for fast computation of the oligo kernel
510  std::vector<double> gauss_table_;
 // lookup table for fast computation of the combined oligo kernel
511  std::vector<std::vector<double> > gauss_tables_;
 // NOTE(review): no description available on this page; presumably the libsvm-encoded training set. Defaults to nullptr.
514  svm_problem* training_set_ = nullptr;
 // NOTE(review): no description available on this page. Defaults to nullptr.
515  svm_problem* training_problem_ = nullptr;
518  };
519 
520 } // namespace OpenMS
521 
Definition: MathFunctions.h:352
Base class for all classes that want to report their progress.
Definition: ProgressLogger.h:55
Serves as a wrapper for the libsvm.
Definition: SVMWrapper.h:82
static void calculateGaussTable(Size border_length, double sigma, std::vector< double > &gauss_table)
void setWeights(const std::vector< Int > &weight_labels, const std::vector< double > &weights)
Sets weights for the classes in C_SVC (see libsvm documentation for further details)
void saveModel(std::string modelFilename) const
saves the svm model
void setTrainingSample(SVMData &training_sample)
This is needed to perform predictions with kernels that are not standard libsvm kernels.
svm_problem * computeKernelMatrix(const SVMData &problem1, const SVMData &problem2)
computes the kernel matrix using the current svm parameters and the given data
static void printToVoid_(const char *)
This function is passed to libsvm for output control.
svm_problem * computeKernelMatrix(svm_problem *problem1, svm_problem *problem2)
computes the kernel matrix using the current svm parameters and the given data
Size border_length_
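the border length (presumably of the oligo kernel; the original description, "the actual kernel type", duplicated that of kernel_type_)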
Definition: SVMWrapper.h:513
static double kernelOligo(const svm_node *x, const svm_node *y, const std::vector< double > &gauss_table, double sigma_square=0, Size max_distance=50)
calculates the oligo kernel value for the encoded sequences 'x' and 'y'
void createRandomPartitions(const SVMData &problem, Size number, std::vector< SVMData > &problems)
You can create 'number' equally sized random partitions.
static svm_problem * mergePartitions(const std::vector< svm_problem * > &problems, Size except)
You can merge partitions excluding the partition with index 'except'.
static void getLabels(svm_problem *problem, std::vector< double > &labels)
Stores the labels of the encoded SVM data in 'labels'.
void setParameter(SVM_parameter_type type, double value)
sets the double parameters of the svm
std::vector< double > sigmas_
for the combined oligo kernel (amount of positional smearing)
Definition: SVMWrapper.h:509
Int train(struct svm_problem *problem)
trains the svm
double sigma_
for the oligo kernel (amount of positional smearing)
Definition: SVMWrapper.h:508
SVM_kernel_type
Kernel type.
Definition: SVMWrapper.h:107
Math::RandomShuffler shuffler_
random shuffler to create training partitions
Definition: SVMWrapper.h:517
bool nextGrid_(const std::vector< double > &start_values, const std::vector< double > &step_sizes, const std::vector< double > &end_values, const bool additive_step_sizes, std::vector< double > &actual_values)
find next grid search parameter combination
void createRandomPartitions(svm_problem *problem, Size number, std::vector< svm_problem * > &partitions)
You can create 'number' equally sized random partitions.
std::vector< std::vector< double > > gauss_tables_
lookup table for fast computation of the combined oligo kernel
Definition: SVMWrapper.h:511
Size kernel_type_
the actual kernel type
Definition: SVMWrapper.h:512
double getPValue(double sigma1, double sigma2, std::pair< double, double > point)
calculates a p-value for a given data point using the model parameters
SVM_parameter_type
Parameters for the svm to be set from outside.
Definition: SVMWrapper.h:92
@ GAMMA
the gamma parameter of the POLY, RBF and SIGMOID kernel
Definition: SVMWrapper.h:99
@ SVM_TYPE
the svm type can be NU_SVR or EPSILON_SVR
Definition: SVMWrapper.h:93
@ C
the C parameter of the svm
Definition: SVMWrapper.h:96
@ PROBABILITY
Definition: SVMWrapper.h:100
@ P
the epsilon parameter for epsilon-SVR
Definition: SVMWrapper.h:98
@ NU
the nu parameter for nu-SVR
Definition: SVMWrapper.h:97
@ DEGREE
the degree for the polynomial kernel
Definition: SVMWrapper.h:95
@ KERNEL_TYPE
the kernel type
Definition: SVMWrapper.h:94
@ SIGMA
Definition: SVMWrapper.h:101
void loadModel(std::string modelFilename)
loads the model
Size getNumberOfEnclosedPoints_(double m1, double m2, const std::vector< std::pair< double, double > > &points)
double getSVRProbability()
Returns the probability parameter sigma of the fitted Laplace model.
virtual ~SVMWrapper()
destructor
void getDecisionValues(svm_problem *data, std::vector< double > &decision_values)
stores the prediction values for the encoded data in 'decision_values'
Int train(SVMData &problem)
trains the svm
void predict(const SVMData &problem, std::vector< double > &results)
predicts the labels using the trained model
static void mergePartitions(const std::vector< SVMData > &problems, Size except, SVMData &merged_problem)
You can merge partitions excluding the partition with index 'except'.
std::vector< double > gauss_table_
lookup table for fast computation of the oligo kernel
Definition: SVMWrapper.h:510
svm_model * model_
the learned svm discriminant
Definition: SVMWrapper.h:507
void scaleData(svm_problem *data, Int max_scale_value=-1)
Scales the data such that every column is scaled to [-1, 1].
void setTrainingSample(svm_problem *training_sample)
This is needed to perform predictions with kernels that are not standard libsvm kernels.
void setParameter(SVM_parameter_type type, Int value)
You can set the parameters of the svm:
Int getIntParameter(SVM_parameter_type type)
You can get the current int parameters of the svm.
void initParameters_()
Initializes the svm with standard parameters.
void predict(const std::vector< svm_node * > &vectors, std::vector< double > &predicted_rts)
predicts the labels using the trained model
void predict(struct svm_problem *problem, std::vector< double > &predicted_labels)
predicts the labels using the trained model
double performCrossValidation(svm_problem *problem_ul, const SVMData &problem_l, const bool is_labeled, const std::map< SVM_parameter_type, double > &start_values_map, const std::map< SVM_parameter_type, double > &step_sizes_map, const std::map< SVM_parameter_type, double > &end_values_map, Size number_of_partitions, Size number_of_runs, std::map< SVM_parameter_type, double > &best_parameters, bool additive_step_sizes=true, bool output=false, String performances_file_name="performances.txt", bool mcc_as_performance_measure=false)
Performs a CV for the data given by 'problem'.
void getSignificanceBorders(svm_problem *data, std::pair< double, double > &borders, double confidence=0.95, Size number_of_runs=5, Size number_of_partitions=5, double step_size=0.01, Size max_iterations=1000000)
calculates the significance borders of the error model and stores them in 'borders'
double getDoubleParameter(SVM_parameter_type type)
You can get the current double parameters of the svm.
SVMData training_data_
the training set (different encoding)
Definition: SVMWrapper.h:516
void getSVCProbabilities(struct svm_problem *problem, std::vector< double > &probabilities, std::vector< double > &prediction_labels)
This function fills probabilities with the probability estimates for the first class.
static double kernelOligo(const std::vector< std::pair< int, double > > &x, const std::vector< std::pair< int, double > > &y, const std::vector< double > &gauss_table, int max_distance=-1)
returns the value of the oligo kernel for sequences 'x' and 'y'
svm_parameter * param_
the parameters for the svm
Definition: SVMWrapper.h:506
void getSignificanceBorders(const SVMData &data, std::pair< double, double > &sigmas, double confidence=0.95, Size number_of_runs=5, Size number_of_partitions=5, double step_size=0.01, Size max_iterations=1000000)
calculates the significance borders of the error model and stores them in 'sigmas'
SVMWrapper()
standard constructor
A more convenient string class.
Definition: String.h:61
int Int
Signed integer type.
Definition: Types.h:102
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:127
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:47
Data structure used in SVMWrapper.
Definition: SVMWrapper.h:56
std::vector< double > labels
Definition: SVMWrapper.h:58
bool store(const String &filename) const
bool operator==(const SVMData &rhs) const
bool load(const String &filename)
std::vector< std::vector< std::pair< Int, double > > > sequences
Definition: SVMWrapper.h:57
SVMData(std::vector< std::vector< std::pair< Int, double > > > &seqs, std::vector< double > &lbls)