OpenMS  2.4.0
ProteinIdentification.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2018.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Chris Bielow $
32 // $Authors: Nico Pfeifer, Chris Bielow $
33 // --------------------------------------------------------------------------
34 
35 #pragma once
36 
42 
43 #include <set>
44 
45 namespace OpenMS
46 {
47  class PeptideIdentification;
48 
68  class OPENMS_DLLAPI ProteinIdentification :
69  public MetaInfoInterface
70  {
71 public:
74 
78  class OPENMS_DLLAPI ProteinGroup
79  {
80  public:
83  typedef std::vector<FloatDataArray> FloatDataArrays;
86  typedef std::vector<StringDataArray> StringDataArrays;
89  typedef std::vector<IntegerDataArray> IntegerDataArrays;
90 
92  double probability; ///< Probability of this group.
93 
95  std::vector<String> accessions; ///< Accessions of (indistinguishable) proteins that belong to the same group.
96 
97  ProteinGroup();
98 
100  bool operator==(const ProteinGroup& rhs) const;
101 
102  /**
103  @brief Comparison operator (for sorting)
104 
105  This operator is intended for sorting protein groups in a "best first"
106  manner. That means higher probabilities are "less" than lower
107  probabilities (!); smaller groups are "less" than larger groups;
108  everything else being equal, accessions are compared lexicographically.
109  */
110  bool operator<(const ProteinGroup& rhs) const;
111 
113 
124  const FloatDataArrays& getFloatDataArrays() const;
126 
129  {
     // Hands out the member itself (not a copy), so callers can modify the stored arrays.
130  return float_data_arrays_;
131  }
132 
134  void setFloatDataArrays(const FloatDataArrays& fda);
135 
137  const StringDataArrays& getStringDataArrays() const;
138 
140  StringDataArrays& getStringDataArrays();
141 
143  void setStringDataArrays(const StringDataArrays& sda);
144 
146  const IntegerDataArrays& getIntegerDataArrays() const;
147 
149  IntegerDataArrays& getIntegerDataArrays();
150 
152  void setIntegerDataArrays(const IntegerDataArrays& ida);
153 
156  {
     // Linear search for the first integer data array whose getName() equals `name`.
     // NOTE(review): the find_if result is dereferenced without comparing it to
     // end(); if no array carries this name the behavior is undefined, so callers
     // must make sure a matching array exists.
157  return *std::find_if(integer_data_arrays_.begin(), integer_data_arrays_.end(),
158  [&name](const IntegerDataArray& da) { return da.getName() == name; } );
159  }
160 
163  {
     // Linear search for the first string data array whose getName() equals `name`.
     // NOTE(review): the find_if result is dereferenced without comparing it to
     // end(); if no array carries this name the behavior is undefined, so callers
     // must make sure a matching array exists.
164  return *std::find_if(string_data_arrays_.begin(), string_data_arrays_.end(),
165  [&name](const StringDataArray& da) { return da.getName() == name; } );
166  }
167 
170  {
     // Linear search for the first float data array whose getName() equals `name`.
     // NOTE(review): the find_if result is dereferenced without comparing it to
     // end(); if no array carries this name the behavior is undefined, so callers
     // must make sure a matching array exists.
171  return *std::find_if(float_data_arrays_.begin(), float_data_arrays_.end(),
172  [&name](const FloatDataArray& da) { return da.getName() == name; } );
173  }
174 
177  {
     // Const overload: linear search for the first integer data array whose
     // getName() equals `name`.
     // NOTE(review): the find_if result is dereferenced without comparing it to
     // end(); if no array carries this name the behavior is undefined, so callers
     // must make sure a matching array exists.
178  return *std::find_if(integer_data_arrays_.begin(), integer_data_arrays_.end(),
179  [&name](const IntegerDataArray& da) { return da.getName() == name; } );
180  }
181 
184  {
     // Const overload: linear search for the first string data array whose
     // getName() equals `name`.
     // NOTE(review): the find_if result is dereferenced without comparing it to
     // end(); if no array carries this name the behavior is undefined, so callers
     // must make sure a matching array exists.
185  return *std::find_if(string_data_arrays_.begin(), string_data_arrays_.end(),
186  [&name](const StringDataArray& da) { return da.getName() == name; } );
187  }
188 
191  {
     // Const overload: linear search for the first float data array whose
     // getName() equals `name`.
     // NOTE(review): the find_if result is dereferenced without comparing it to
     // end(); if no array carries this name the behavior is undefined, so callers
     // must make sure a matching array exists.
192  return *std::find_if(float_data_arrays_.begin(), float_data_arrays_.end(),
193  [&name](const FloatDataArray& da) { return da.getName() == name; } );
194  }
195 
196  private:
199 
202 
205  };
206 
209  {
212  SIZE_OF_PEAKMASSTYPE
213  };
215  static const std::string NamesOfPeakMassType[SIZE_OF_PEAKMASSTYPE];
216 
218  struct OPENMS_DLLAPI SearchParameters :
219  public MetaInfoInterface
220  {
226  std::vector<String> fixed_modifications; ///< Used fixed modifications.
227  std::vector<String> variable_modifications; ///< Allowed variable modifications.
234 
237  SearchParameters(const SearchParameters &) = default; // Copy constructor (compiler-generated).
239  SearchParameters(SearchParameters&&) = default; // Move constructor (compiler-generated).
241  ~SearchParameters() = default; // Destructor (compiler-generated).
242 
244  SearchParameters & operator=(const SearchParameters &) = default; // Copy assignment (compiler-generated).
246  SearchParameters& operator=(SearchParameters&&) & = default; // Move assignment (compiler-generated). NOTE(review): this one is lvalue-ref-qualified ('&') while the copy assignment is not -- confirm the asymmetry is intended.
247 
248  bool operator==(const SearchParameters& rhs) const;
249 
250  bool operator!=(const SearchParameters& rhs) const;
251 
252  std::pair<int,int> getChargeRange() const;
253  int getChargeValue_(String& charge_str) const;
254 
255  };
256 
266  virtual ~ProteinIdentification();
267 
269  ProteinIdentification& operator=(const ProteinIdentification&) = default;
271  ProteinIdentification& operator=(ProteinIdentification&&) = default;
272 
274  bool operator==(const ProteinIdentification& rhs) const;
276  bool operator!=(const ProteinIdentification& rhs) const;
278 
280 
281  const std::vector<ProteinHit> & getHits() const;
284  std::vector<ProteinHit> & getHits();
286  void insertHit(const ProteinHit & input);
288  void insertHit(ProteinHit && input);
289 
295  void setHits(const std::vector<ProteinHit>& hits);
296 
298  std::vector<ProteinHit>::iterator findHit(const String& accession);
299 
301  const std::vector<ProteinGroup>& getProteinGroups() const;
303  std::vector<ProteinGroup>& getProteinGroups();
305  void insertProteinGroup(const ProteinGroup & group);
306 
308  const std::vector<ProteinGroup>& getIndistinguishableProteins() const;
310  std::vector<ProteinGroup>& getIndistinguishableProteins();
312  void insertIndistinguishableProteins(const ProteinGroup& group);
314  void fillIndistinguishableGroupsWithSingletons();
315 
317  double getSignificanceThreshold() const;
319  void setSignificanceThreshold(double value);
321  const String& getScoreType() const;
323  void setScoreType(const String& type);
325  bool isHigherScoreBetter() const;
327  void setHigherScoreBetter(bool higher_is_better);
329  void sort();
331  void assignRanks();
339  void computeCoverage(const std::vector<PeptideIdentification>& pep_ids);
341 
348  void computeModifications(
349  const std::vector<PeptideIdentification>& pep_ids,
350  const StringList & skip_modifications);
351 
352 
354 
355  const DateTime& getDateTime() const;
358  void setDateTime(const DateTime& date);
360  void setSearchEngine(const String& search_engine);
362  const String& getSearchEngine() const;
364  void setSearchEngineVersion(const String& search_engine_version);
366  const String& getSearchEngineVersion() const;
368  void setSearchParameters(const SearchParameters& search_parameters);
370  const SearchParameters& getSearchParameters() const;
372  SearchParameters& getSearchParameters();
374  const String& getIdentifier() const;
376  void setIdentifier(const String& id);
378  void setPrimaryMSRunPath(const StringList& s);
380  void getPrimaryMSRunPath(StringList& toFill) const;
382  bool hasInferenceData() const;
384 
385 protected:
387 
394 
396 
399  std::vector<ProteinHit> protein_hits_;
400  std::vector<ProteinGroup> protein_groups_;
402  std::vector<ProteinGroup> indistinguishable_proteins_; ///< Indistinguishable proteins: accessions[0] is "group leader", probability is meaningless.
405  };
406 
407 } //namespace OpenMS
Representation of a protein identification run.
Definition: ProteinIdentification.h:68
std::vector< FloatDataArray > FloatDataArrays
Definition: ProteinIdentification.h:83
String db
The used database.
Definition: ProteinIdentification.h:221
std::vector< StringDataArray > StringDataArrays
Definition: ProteinIdentification.h:86
A more convenient string class.
Definition: String.h:58
IntegerDataArrays integer_data_arrays_
Integer data arrays.
Definition: ProteinIdentification.h:204
StringDataArray & getStringDataArrayByName(String name)
Returns a mutable reference to the first string meta data array with the given name.
Definition: ProteinIdentification.h:162
std::vector< String > fixed_modifications
Used fixed modifications.
Definition: ProteinIdentification.h:226
String db_version
The database version.
Definition: ProteinIdentification.h:222
IntegerDataArray & getIntegerDataArrayByName(String name)
Returns a mutable reference to the first integer meta data array with the given name.
Definition: ProteinIdentification.h:155
Definition: ProteinIdentification.h:211
double fragment_mass_tolerance
Mass tolerance of fragment ions (Dalton or ppm)
Definition: ProteinIdentification.h:229
const StringDataArray & getStringDataArrayByName(String name) const
Returns a const reference to the first string meta data array with the given name.
Definition: ProteinIdentification.h:183
unsigned int UInt
Unsigned integer type.
Definition: Types.h:94
FloatDataArrays float_data_arrays_
Float data arrays.
Definition: ProteinIdentification.h:198
String search_engine_version_
Definition: ProteinIdentification.h:390
double precursor_mass_tolerance
Mass tolerance of precursor ions (Dalton or ppm)
Definition: ProteinIdentification.h:231
FloatDataArrays & getFloatDataArrays()
Returns a mutable reference to the float meta data arrays.
Definition: ProteinIdentification.h:128
bool operator==(_Iterator< _Val, _Ref, _Ptr > const &, _Iterator< _Val, _Ref, _Ptr > const &)
Definition: KDTree.h:806
OpenMS::DataArrays::FloatDataArray FloatDataArray
Float data array vector type.
Definition: ProteinIdentification.h:82
ProteinHit HitType
Hit type definition.
Definition: ProteinIdentification.h:73
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:46
Search parameters of the DB search.
Definition: ProteinIdentification.h:218
bool operator<(const MultiplexDeltaMasses &dm1, const MultiplexDeltaMasses &dm2)
Float data array class.
Definition: DataArrays.h:45
bool precursor_mass_tolerance_ppm
Mass tolerance unit of precursor ions (true: ppm, false: Dalton)
Definition: ProteinIdentification.h:232
String taxonomy
The taxonomy restriction.
Definition: ProteinIdentification.h:223
Integer data array class.
Definition: DataArrays.h:52
PeakMassType
Peak mass type.
Definition: ProteinIdentification.h:208
FloatDataArray & getFloatDataArrayByName(String name)
Returns a mutable reference to the first float meta data array with the given name.
Definition: ProteinIdentification.h:169
const IntegerDataArray & getIntegerDataArrayByName(String name) const
Returns a const reference to the first integer meta data array with the given name.
Definition: ProteinIdentification.h:176
std::vector< String > accessions
Accessions of (indistinguishable) proteins that belong to the same group.
Definition: ProteinIdentification.h:95
std::vector< ProteinHit > protein_hits_
Definition: ProteinIdentification.h:399
Protease digestion_enzyme
The cleavage site information in detail (from ProteaseDB)
Definition: ProteinIdentification.h:233
String protein_score_type_
Definition: ProteinIdentification.h:397
std::vector< IntegerDataArray > IntegerDataArrays
Definition: ProteinIdentification.h:89
double protein_significance_threshold_
Definition: ProteinIdentification.h:403
String id_
Definition: ProteinIdentification.h:388
UInt missed_cleavages
The number of allowed missed cleavages.
Definition: ProteinIdentification.h:228
Interface for classes that can store arbitrary meta information (Type-Name-Value tuples).
Definition: MetaInfoInterface.h:55
std::vector< ProteinGroup > protein_groups_
Definition: ProteinIdentification.h:400
String search_engine_
Definition: ProteinIdentification.h:389
std::vector< String > StringList
Vector of String.
Definition: ListUtils.h:73
Representation of a protein hit.
Definition: ProteinHit.h:57
bool higher_score_better_
Definition: ProteinIdentification.h:398
DateTime date_
Definition: ProteinIdentification.h:392
Definition: ProteinIdentification.h:210
bool operator!=(_Iterator< _Val, _Ref, _Ptr > const &, _Iterator< _Val, _Ref, _Ptr > const &)
Definition: KDTree.h:824
DateTime Class.
Definition: DateTime.h:54
const FloatDataArray & getFloatDataArrayByName(String name) const
Returns a const reference to the first float meta data array with the given name.
Definition: ProteinIdentification.h:190
bool fragment_mass_tolerance_ppm
Mass tolerance unit of fragment ions (true: ppm, false: Dalton)
Definition: ProteinIdentification.h:230
std::vector< ProteinGroup > indistinguishable_proteins_
Indistinguishable proteins: accessions[0] is "group leader", probability is meaningless.
Definition: ProteinIdentification.h:402
String data array class.
Definition: DataArrays.h:59
PeakMassType mass_type
Mass type of the peaks.
Definition: ProteinIdentification.h:225
Bundles multiple (e.g. indistinguishable) proteins in a group.
Definition: ProteinIdentification.h:78
SearchParameters search_parameters_
Definition: ProteinIdentification.h:391
OpenMS::DataArrays::StringDataArray StringDataArray
String data array vector type.
Definition: ProteinIdentification.h:85
OpenMS::DataArrays::IntegerDataArray IntegerDataArray
Integer data array vector type.
Definition: ProteinIdentification.h:88
String charges
The allowed charges for the search.
Definition: ProteinIdentification.h:224
double probability
Probability of this group.
Definition: ProteinIdentification.h:92
StringDataArrays string_data_arrays_
String data arrays.
Definition: ProteinIdentification.h:201
Representation of a digestion enzyme for proteins (protease)
Definition: DigestionEnzymeProtein.h:48
std::vector< String > variable_modifications
Allowed variable modifications.
Definition: ProteinIdentification.h:227