OpenMS  2.7.0
LibSVMEncoder.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2021.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Timo Sachsenberg $
32 // $Authors: Nico Pfeifer $
33 // --------------------------------------------------------------------------
34 
35 #pragma once
36 
40 #include <svm.h>
41 
42 #include <vector>
43 #include <utility>
44 
45 namespace OpenMS
46 {
55  class OPENMS_DLLAPI LibSVMEncoder
56  {
57 public:
62 
72  void encodeCompositionVector(const String & sequence, std::vector<std::pair<Int, double> > & encoded_vector, const String & allowed_characters = "ACDEFGHIKLMNPQRSTVWY");
73 
83  void encodeCompositionVectors(const std::vector<String> & sequences, const String & allowed_characters, std::vector<std::vector<std::pair<Int, double> > > & composition_vectors);
85  svm_node * encodeLibSVMVector(const std::vector<std::pair<Int, double> > & feature_vector);
86 
88  void encodeLibSVMVectors(const std::vector<std::vector<std::pair<Int, double> > > & feature_vectors, std::vector<svm_node *> & libsvm_vectors);
89 
91  svm_problem * encodeLibSVMProblem(const std::vector<svm_node *> & vectors,
92  std::vector<double> & labels);
93 
95  svm_problem * encodeLibSVMProblemWithCompositionVectors(const std::vector<String> & sequences,
96  std::vector<double> & labels,
97  const String & allowed_characters);
98 
104  svm_problem * encodeLibSVMProblemWithCompositionAndLengthVectors(const std::vector<String> & sequences,
105  std::vector<double> & labels,
106  const String & allowed_characters,
107  UInt maximum_sequence_length);
108 
114  svm_problem * encodeLibSVMProblemWithCompositionLengthAndWeightVectors(const std::vector<String> & sequences,
115  std::vector<double> & labels,
116  const String & allowed_characters);
117 
119  bool storeLibSVMProblem(const String & filename, const svm_problem * problem) const;
120 
122  svm_problem * loadLibSVMProblem(const String & filename);
123 
125  void encodeOligoBorders(String sequence,
126  UInt k_mer_length,
127  const String & allowed_characters,
128  UInt border_length,
129  std::vector<std::pair<Int, double> > & libsvm_vector,
130  bool strict = false,
131  bool unpaired = false,
132  bool length_encoding = false);
133 
135  svm_problem * encodeLibSVMProblemWithOligoBorderVectors(const std::vector<String> & sequences,
136  std::vector<double> & labels,
137  UInt k_mer_length,
138  const String & allowed_characters,
139  UInt border_length,
140  bool strict = false,
141  bool unpaired = false,
142  bool length_encoding = false);
143 
145  void encodeProblemWithOligoBorderVectors(const std::vector<AASequence> & sequences,
146  UInt k_mer_length,
147  const String & allowed_characters,
148  UInt border_length,
149  std::vector<std::vector<std::pair<Int, double> > > & vectors);
150 
157  void libSVMVectorToString(svm_node * vector, String & output);
158 
165  void libSVMVectorsToString(svm_problem * vector, String & output);
166 
173  void encodeOligo(const AASequence & sequence,
174  UInt k_mer_length,
175  const String & allowed_characters,
176  std::vector<std::pair<Int, double> > & values,
177  bool is_right_border = false);
178 
184  static void destroyProblem(svm_problem * problem);
185 
186  static std::vector<double> predictPeptideRT(const std::vector<String> & sequences,
187  SVMWrapper& svm,
188  const String & allowed_characters = "ACDEFGHIKLMNPQRSTVWY",
189  UInt maximum_sequence_length = 50)
190  {
191  std::vector<double> predicted_retention_times;
192 
193  LibSVMEncoder encoder;
194  std::vector<double> temp_rts;
195  temp_rts.resize(sequences.size(), 0);
196  svm_problem * prediction_data =
198  temp_rts,
199  allowed_characters,
200  maximum_sequence_length);
201  svm.predict(prediction_data, predicted_retention_times);
202  LibSVMEncoder::destroyProblem(prediction_data);
203  return predicted_retention_times;
204  }
205 
206 private:
208  static bool cmpOligos_(std::pair<Int, double> a,
209  std::pair<Int, double> b);
210 
211  };
212 
213 } // namespace OpenMS
214 
Representation of a peptide/protein sequence.
Definition: AASequence.h:112
Serves for encoding sequences into feature vectors.
Definition: LibSVMEncoder.h:56
svm_problem * loadLibSVMProblem(const String &filename)
loads the LibSVM-encoded data stored in 'filename'
void encodeOligoBorders(String sequence, UInt k_mer_length, const String &allowed_characters, UInt border_length, std::vector< std::pair< Int, double > > &libsvm_vector, bool strict=false, bool unpaired=false, bool length_encoding=false)
encodes the borders of the sequence as k_mer oligos and stores them in 'libsvm_vector'
~LibSVMEncoder()
Destructor.
void encodeCompositionVectors(const std::vector< String > &sequences, const String &allowed_characters, std::vector< std::vector< std::pair< Int, double > > > &composition_vectors)
stores composition vectors of the sequences given by 'sequences' in 'composition_vectors'
svm_problem * encodeLibSVMProblemWithCompositionVectors(const std::vector< String > &sequences, std::vector< double > &labels, const String &allowed_characters)
creates composition vectors for 'sequences' and stores them in LibSVM compliant format
svm_node * encodeLibSVMVector(const std::vector< std::pair< Int, double > > &feature_vector)
encodes the feature vector in LibSVM compliant format
svm_problem * encodeLibSVMProblemWithCompositionLengthAndWeightVectors(const std::vector< String > &sequences, std::vector< double > &labels, const String &allowed_characters)
creates composition vectors with additional length and average weight information for 'sequences' and stores them in LibSVM compliant format
svm_problem * encodeLibSVMProblemWithCompositionAndLengthVectors(const std::vector< String > &sequences, std::vector< double > &labels, const String &allowed_characters, UInt maximum_sequence_length)
creates composition vectors with additional length information for 'sequences' and stores them in LibSVM compliant format
LibSVMEncoder()
Constructor.
static void destroyProblem(svm_problem *problem)
frees all the memory of the svm_problem instance
void libSVMVectorsToString(svm_problem *vector, String &output)
stores a string representation of the encoded sequences in 'vectors' in 'output'
svm_problem * encodeLibSVMProblemWithOligoBorderVectors(const std::vector< String > &sequences, std::vector< double > &labels, UInt k_mer_length, const String &allowed_characters, UInt border_length, bool strict=false, bool unpaired=false, bool length_encoding=false)
creates oligo border vectors for 'sequences' and stores them in LibSVM compliant format
void encodeCompositionVector(const String &sequence, std::vector< std::pair< Int, double > > &encoded_vector, const String &allowed_characters="ACDEFGHIKLMNPQRSTVWY")
stores a composition vector of 'sequence' in 'encoded_vector'
static std::vector< double > predictPeptideRT(const std::vector< String > &sequences, SVMWrapper &svm, const String &allowed_characters="ACDEFGHIKLMNPQRSTVWY", UInt maximum_sequence_length=50)
Definition: LibSVMEncoder.h:186
bool storeLibSVMProblem(const String &filename, const svm_problem *problem) const
stores the LibSVM-encoded data in a text file that can be used by the LibSVM applications (svm-scale,...
void encodeOligo(const AASequence &sequence, UInt k_mer_length, const String &allowed_characters, std::vector< std::pair< Int, double > > &values, bool is_right_border=false)
encodes an AASequence instance in oligo encoding
svm_problem * encodeLibSVMProblem(const std::vector< svm_node * > &vectors, std::vector< double > &labels)
encodes the LibSVM compliant vectors into a LibSVM compliant structure
static bool cmpOligos_(std::pair< Int, double > a, std::pair< Int, double > b)
comparator for oligos encoded by encodeOligo
void encodeLibSVMVectors(const std::vector< std::vector< std::pair< Int, double > > > &feature_vectors, std::vector< svm_node * > &libsvm_vectors)
encodes the feature vectors in LibSVM compliant format
void libSVMVectorToString(svm_node *vector, String &output)
stores a string representation of the encoded sequence 'vector' in 'output'
void encodeProblemWithOligoBorderVectors(const std::vector< AASequence > &sequences, UInt k_mer_length, const String &allowed_characters, UInt border_length, std::vector< std::vector< std::pair< Int, double > > > &vectors)
creates oligo border vectors for 'sequences' and stores them in 'vectors'
Serves as a wrapper for the libsvm.
Definition: SVMWrapper.h:82
void predict(struct svm_problem *problem, std::vector< double > &predicted_labels)
predicts the labels using the trained model
A more convenient string class.
Definition: String.h:61
unsigned int UInt
Unsigned integer type.
Definition: Types.h:94
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:47