OpenMS  3.0.0
ProteinIdentification.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2022.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Chris Bielow $
32 // $Authors: Nico Pfeifer, Chris Bielow $
33 // --------------------------------------------------------------------------
34 
35 #pragma once
36 
43 
44 #include <set>
45 
46 namespace OpenMS
47 {
// Forward declarations. In this header these types only appear in reference
// parameters or as template arguments of declared (not defined) functions,
// so their full definitions are not needed here.
48  class MSExperiment;
49  class PeptideIdentification;
50  class PeptideEvidence;
51  class ConsensusMap;
52 
72  class OPENMS_DLLAPI ProteinIdentification :
73  public MetaInfoInterface
74  {
75 public:
78 
80  struct Mapping
81  {
82  std::map<String, StringList> identifier_to_msrunpath;
83  std::map<StringList, String> runpath_to_identifier;
84 
85  Mapping() = default;
86 
87  explicit Mapping(const std::vector<ProteinIdentification>& prot_ids)
88  {
89  create(prot_ids);
90  }
91  void create(const std::vector<ProteinIdentification>& prot_ids)
92  {
93  identifier_to_msrunpath.clear();
94  runpath_to_identifier.clear();
95  StringList filenames;
96  for (const ProteinIdentification& prot_id : prot_ids)
97  {
98  prot_id.getPrimaryMSRunPath(filenames);
99  if (filenames.empty())
100  {
101  throw Exception::MissingInformation(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "No MS run path annotated in ProteinIdentification.");
102  }
103  identifier_to_msrunpath[prot_id.getIdentifier()] = filenames;
104  const auto& it = runpath_to_identifier.find(filenames);
105  if (it != runpath_to_identifier.end())
106  {
107  throw Exception::InvalidValue(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION,
108  "Multiple protein identifications with the same ms-run-path in Consensus/FeatureXML. Check input!\n",
109  ListUtils::concatenate(filenames, ","));
110  }
111  runpath_to_identifier[filenames] = prot_id.getIdentifier();
112  }
113  }
114  };
115 
119  class OPENMS_DLLAPI ProteinGroup
120  {
121  public:
124  typedef std::vector<FloatDataArray> FloatDataArrays;
127  typedef std::vector<StringDataArray> StringDataArrays;
130  typedef std::vector<IntegerDataArray> IntegerDataArrays;
131 
133  double probability;
134 
136  std::vector<String> accessions;
137 
138  ProteinGroup();
139 
141  bool operator==(const ProteinGroup& rhs) const;
142 
143  /*
144  @brief Comparison operator (for sorting)
145 
146  This operator is intended for sorting protein groups in a "best first"
147  manner. That means higher probabilities are "less" than lower
148  probabilities (!); smaller groups are "less" than larger groups;
149  everything else being equal, accessions are compared lexicographically.
150  */
151  bool operator<(const ProteinGroup& rhs) const;
152 
154 
165  const FloatDataArrays& getFloatDataArrays() const;
167 
170  {
171  return float_data_arrays_;
172  }
173 
175  void setFloatDataArrays(const FloatDataArrays& fda);
176 
178  const StringDataArrays& getStringDataArrays() const;
179 
181  StringDataArrays& getStringDataArrays();
182 
184  void setStringDataArrays(const StringDataArrays& sda);
185 
187  const IntegerDataArrays& getIntegerDataArrays() const;
188 
190  IntegerDataArrays& getIntegerDataArrays();
191 
193  void setIntegerDataArrays(const IntegerDataArrays& ida);
194 
197  {
198  return *std::find_if(integer_data_arrays_.begin(), integer_data_arrays_.end(),
199  [&name](const IntegerDataArray& da) { return da.getName() == name; } );
200  }
201 
204  {
205  return *std::find_if(string_data_arrays_.begin(), string_data_arrays_.end(),
206  [&name](const StringDataArray& da) { return da.getName() == name; } );
207  }
208 
211  {
212  return *std::find_if(float_data_arrays_.begin(), float_data_arrays_.end(),
213  [&name](const FloatDataArray& da) { return da.getName() == name; } );
214  }
215 
218  {
219  return *std::find_if(integer_data_arrays_.begin(), integer_data_arrays_.end(),
220  [&name](const IntegerDataArray& da) { return da.getName() == name; } );
221  }
222 
225  {
226  return *std::find_if(string_data_arrays_.begin(), string_data_arrays_.end(),
227  [&name](const StringDataArray& da) { return da.getName() == name; } );
228  }
229 
232  {
233  return *std::find_if(float_data_arrays_.begin(), float_data_arrays_.end(),
234  [&name](const FloatDataArray& da) { return da.getName() == name; } );
235  }
236 
237  private:
240 
243 
246  };
247 
// Peak mass type.
// NOTE(review): the `enum PeakMassType` header (original line 249) and the
// enumerators on original lines 251-252 are absent from this listing; only the
// trailing enumerator is visible.
250  {
253  SIZE_OF_PEAKMASSTYPE // last enumerator; used below as the length of NamesOfPeakMassType
254  };
255 
// String names indexed by PeakMassType; only declared here (definition not in this listing).
257  static const std::string NamesOfPeakMassType[SIZE_OF_PEAKMASSTYPE];
258 
// Search parameters of the DB search. Inherits MetaInfoInterface, so arbitrary
// meta values can be attached in addition to the fixed fields.
// NOTE(review): the fields on original lines 263-267 and 270-276 (db, db_version,
// taxonomy, charges, mass_type, missed_cleavages, the fragment/precursor tolerances,
// digestion_enzyme, enzyme_term_specificity) and original line 279 are absent from
// this listing.
260  struct OPENMS_DLLAPI SearchParameters :
261  public MetaInfoInterface
262  {
// Used fixed modifications.
268  std::vector<String> fixed_modifications;
// Allowed variable modifications.
269  std::vector<String> variable_modifications;
277 
// Compiler-generated copy constructor, move constructor and destructor.
280  SearchParameters(const SearchParameters&) = default;
282  SearchParameters(SearchParameters&&) = default;
284  ~SearchParameters() = default;
285 
// Compiler-generated copy assignment.
287  SearchParameters& operator=(const SearchParameters&) = default;
// Compiler-generated move assignment. The trailing `&` ref-qualifier restricts it to
// lvalue targets. NOTE(review): the copy assignment above is not ref-qualified -
// confirm the asymmetry is intended.
289  SearchParameters& operator=(SearchParameters&&)& = default;
290 
// Equality / inequality comparison (declared only; compared fields not visible here).
291  bool operator==(const SearchParameters& rhs) const;
292 
293  bool operator!=(const SearchParameters& rhs) const;
294 
// Returns a pair of ints describing the charge range.
// Presumably (min, max) parsed from the `charges` string - TODO confirm against the implementation.
296  std::pair<int,int> getChargeRange() const;
297 
// Tests whether these parameters can be merged with @p sp for the given @p experiment_type.
// NOTE(review): the exact merge criteria are defined in the implementation, not here.
302  bool mergeable(const ProteinIdentification::SearchParameters& sp, const String& experiment_type) const;
303 
304  private:
// Helper taking a mutable charge string and returning an int; presumably used by getChargeRange() - TODO confirm.
305  int getChargeValue_(String& charge_str) const;
306  };
307 
// NOTE(review): original lines 308-316 are absent from this listing (presumably the
// constructors - confirm against the real header).
// Virtual destructor (declared only).
317  virtual ~ProteinIdentification();
318 
// Compiler-generated copy assignment.
320  ProteinIdentification& operator=(const ProteinIdentification&) = default;
// Compiler-generated move assignment.
322  ProteinIdentification& operator=(ProteinIdentification&&) = default;
323 
// Equality / inequality comparison with another identification run (declared only).
325  bool operator==(const ProteinIdentification& rhs) const;
327  bool operator!=(const ProteinIdentification& rhs) const;
329 
331 
// --- Protein hits -----------------------------------------------------------
// Const and mutable access to the vector of protein hits.
332  const std::vector<ProteinHit>& getHits() const;
335  std::vector<ProteinHit>& getHits();
// Adds a single hit; overloads for copying and for moving the hit in.
337  void insertHit(const ProteinHit& input);
339  void insertHit(ProteinHit&& input);
340 
// Sets the protein hits from @p hits.
346  void setHits(const std::vector<ProteinHit>& hits);
347 
// Looks up a hit by accession and returns an iterator into the hit vector.
// NOTE(review): presumably returns getHits().end() when nothing matches - confirm before dereferencing.
349  std::vector<ProteinHit>::iterator findHit(const String& accession);
350 
// --- Protein groups ---------------------------------------------------------
// Const and mutable access to the protein groups, plus insertion of one group.
352  const std::vector<ProteinGroup>& getProteinGroups() const;
354  std::vector<ProteinGroup>& getProteinGroups();
356  void insertProteinGroup(const ProteinGroup& group);
357 
// --- Indistinguishable proteins ----------------------------------------------
// Const and mutable access to the groups of indistinguishable proteins, plus insertion of one group.
359  const std::vector<ProteinGroup>& getIndistinguishableProteins() const;
361  std::vector<ProteinGroup>& getIndistinguishableProteins();
363  void insertIndistinguishableProteins(const ProteinGroup& group);
// NOTE(review): by its name, adds single-protein groups for proteins not yet in a group;
// exact behavior is defined in the implementation - confirm.
365  void fillIndistinguishableGroupsWithSingletons();
366 
// --- Scoring ----------------------------------------------------------------
// Getter / setter for the protein significance threshold.
368  double getSignificanceThreshold() const;
370  void setSignificanceThreshold(double value);
// Getter / setter for the protein score type.
372  const String& getScoreType() const;
374  void setScoreType(const String& type);
// Getter / setter for the score orientation flag (true: a higher score is better).
376  bool isHigherScoreBetter() const;
378  void setHigherScoreBetter(bool higher_is_better);
// NOTE(review): sort order and ranking rules are defined in the implementation; presumably
// both honor the score orientation flag - confirm.
380  void sort();
382  void assignRanks();
// --- Coverage ---------------------------------------------------------------
// Computes coverage from peptide identifications, given either directly or via a ConsensusMap.
// NOTE(review): @p use_unassigned_ids presumably controls whether the map's unassigned
// peptide identifications are included - confirm.
390  void computeCoverage(const std::vector<PeptideIdentification>& pep_ids);
391  void computeCoverage(const ConsensusMap& cmap, bool use_unassigned_ids);
393 
// --- Modifications ----------------------------------------------------------
// Computes modifications from peptide identifications, given either directly or via a ConsensusMap.
// @p skip_modifications is a list of modification names; presumably these are ignored - confirm.
400  void computeModifications(
401  const std::vector<PeptideIdentification>& pep_ids,
402  const StringList& skip_modifications);
403  void computeModifications(
404  const ConsensusMap& cmap,
405  const StringList& skip_modifications,
406  bool use_unassigned_ids);
407 
408 
410 
// --- Run metadata -----------------------------------------------------------
// Getter / setter for the date of the identification run.
411  const DateTime& getDateTime() const;
414  void setDateTime(const DateTime& date);
// Getter / setter for the search engine name.
416  void setSearchEngine(const String& search_engine);
418  const String& getSearchEngine() const;
// Returns a String by value (not a reference to a member), unlike getSearchEngine().
420  const String getOriginalSearchEngineName() const;
// Getter / setter for the search engine version.
422  void setSearchEngineVersion(const String& search_engine_version);
424  const String& getSearchEngineVersion() const;
// Getter / setter for the inference engine name; the getter returns by value.
// NOTE(review): the setter's parameter is named `search_engine` - likely a copy/paste leftover.
426  void setInferenceEngine(const String& search_engine);
428  const String getInferenceEngine() const;
// Getter / setter for the inference engine version; the getter returns by value.
430  void setInferenceEngineVersion(const String& inference_engine_version);
432  const String getInferenceEngineVersion() const;
// Setters (copy and move) and accessors (const and mutable) for the search parameters.
434  void setSearchParameters(const SearchParameters& search_parameters);
436  void setSearchParameters(SearchParameters&& search_parameters);
438  const SearchParameters& getSearchParameters() const;
440  SearchParameters& getSearchParameters();
// Getter / setter for the run identifier.
442  const String& getIdentifier() const;
444  void setIdentifier(const String& id);
// --- Primary MS run paths ----------------------------------------------------
// Sets the primary MS run path(s) from @p s.
// NOTE(review): the meaning of @p raw is not visible in this listing - confirm in the implementation.
450  void setPrimaryMSRunPath(const StringList& s, bool raw = false);
451 
// Overload that additionally takes a mutable MSExperiment.
// NOTE(review): how @p e is used or modified is not visible here - confirm.
453  void setPrimaryMSRunPath(const StringList& s, MSExperiment& e);
// Adds one path or a list of paths.
454  void addPrimaryMSRunPath(const String& s, bool raw = false);
455  void addPrimaryMSRunPath(const StringList& s, bool raw = false);
456 
// Delivers the primary MS run path(s) through the out-parameter @p output.
// NOTE(review): whether @p output is cleared when nothing is annotated is not visible
// here; callers reusing a buffer should not assume it.
462  void getPrimaryMSRunPath(StringList& output, bool raw = false) const;
463 
// Returns the number of primary MS run paths.
465  Size nrPrimaryMSRunPaths(bool raw = false) const;
466 
// --- Queries ----------------------------------------------------------------
// NOTE(review): criteria for the two predicates below are defined in the implementation.
469  bool hasInferenceData() const;
470 
472  bool hasInferenceEngineAsSearchEngine() const;
473 
// Tests whether peptide IDs of @p id_run can be merged with this run for the given @p experiment_type.
477  bool peptideIDsMergeable(const ProteinIdentification& id_run, const String& experiment_type) const;
478 
// Returns search engine settings as (String, String) pairs; @p se defaults to an empty string.
// NOTE(review): presumably (name, value) pairs, with empty @p se meaning "this run's
// search engine" - confirm.
481  std::vector<std::pair<String,String>> getSearchEngineSettingsAsPairs(const String& se = "") const;
482 
484 
485 protected:
// NOTE(review): several member declarations are absent from this listing (original
// lines 488-492: id_, search_engine_, search_engine_version_, search_parameters_, date_;
// 497-498: protein_score_type_, higher_score_better_; 503: protein_significance_threshold_).
487 
494 
496 
// All protein hits of this run.
499  std::vector<ProteinHit> protein_hits_;
// Protein groups of this run.
500  std::vector<ProteinGroup> protein_groups_;
// Indistinguishable proteins: accessions[0] is "group leader", probability is meaningless.
502  std::vector<ProteinGroup> indistinguishable_proteins_;
505 
506  private:
// Private helpers taking maps keyed by String.
// NOTE(review): the String keys are presumably protein accessions (cf. `map_acc_2_evidence`) - confirm.
// Takes a read-only map of PeptideEvidence sets; presumably used by computeCoverage() - confirm.
507  void computeCoverageFromEvidenceMapping_(const std::unordered_map<String, std::set<PeptideEvidence>>& map);
// Fills the out-parameter @p map_acc_2_evidence from @p pep_ids; const, so members are not modified.
508  void fillEvidenceMapping_(std::unordered_map<String, std::set<PeptideEvidence> >& map_acc_2_evidence,
509  const std::vector<PeptideIdentification>& pep_ids) const;
510 
// Fills the out-parameter @p prot2mod with sets of (Size, ResidueModification) pairs from @p pep_ids.
// NOTE(review): the Size is presumably a residue position within the protein - confirm.
511  void fillModMapping_(const std::vector<PeptideIdentification>& pep_ids, const StringList& skip_modifications,
512  std::unordered_map<String, std::set<std::pair<Size, ResidueModification>>>& prot2mod) const;
513  };
514 
515 } //namespace OpenMS
Representation of a protein identification run.
Definition: ProteinIdentification.h:72
std::vector< FloatDataArray > FloatDataArrays
Definition: ProteinIdentification.h:124
String db
The used database.
Definition: ProteinIdentification.h:263
std::vector< StringDataArray > StringDataArrays
Definition: ProteinIdentification.h:127
A more convenient string class.
Definition: String.h:58
IntegerDataArrays integer_data_arrays_
Integer data arrays.
Definition: ProteinIdentification.h:245
StringDataArray & getStringDataArrayByName(String name)
Returns a mutable reference to the first string meta data array with the given name.
Definition: ProteinIdentification.h:203
std::vector< String > fixed_modifications
Used fixed modifications.
Definition: ProteinIdentification.h:268
String db_version
The database version.
Definition: ProteinIdentification.h:264
std::map< String, StringList > identifier_to_msrunpath
Definition: ProteinIdentification.h:82
IntegerDataArray & getIntegerDataArrayByName(String name)
Returns a mutable reference to the first integer meta data array with the given name.
Definition: ProteinIdentification.h:196
void create(const std::vector< ProteinIdentification > &prot_ids)
Definition: ProteinIdentification.h:91
Definition: ProteinIdentification.h:252
double fragment_mass_tolerance
Mass tolerance of fragment ions (Dalton or ppm)
Definition: ProteinIdentification.h:271
const StringDataArray & getStringDataArrayByName(String name) const
Returns a const reference to the first string meta data array with the given name.
Definition: ProteinIdentification.h:224
unsigned int UInt
Unsigned integer type.
Definition: Types.h:94
FloatDataArrays float_data_arrays_
Float data arrays.
Definition: ProteinIdentification.h:239
String search_engine_version_
Definition: ProteinIdentification.h:490
double precursor_mass_tolerance
Mass tolerance of precursor ions (Dalton or ppm)
Definition: ProteinIdentification.h:273
FloatDataArrays & getFloatDataArrays()
Returns a mutable reference to the float meta data arrays.
Definition: ProteinIdentification.h:169
A container for consensus elements.
Definition: ConsensusMap.h:82
std::map< StringList, String > runpath_to_identifier
Definition: ProteinIdentification.h:83
OpenMS::DataArrays::FloatDataArray FloatDataArray
Float data array vector type.
Definition: ProteinIdentification.h:123
bool operator==(const IDBoostGraph::ProteinGroup &lhs, const IDBoostGraph::ProteinGroup &rhs)
ProteinHit HitType
Hit type definition.
Definition: ProteinIdentification.h:77
Mapping(const std::vector< ProteinIdentification > &prot_ids)
Definition: ProteinIdentification.h:87
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:47
Search parameters of the DB search.
Definition: ProteinIdentification.h:260
bool operator<(const MultiplexDeltaMasses &dm1, const MultiplexDeltaMasses &dm2)
Float data array class.
Definition: DataArrays.h:45
bool precursor_mass_tolerance_ppm
Mass tolerance unit of precursor ions (true: ppm, false: Dalton)
Definition: ProteinIdentification.h:274
String taxonomy
The taxonomy restriction.
Definition: ProteinIdentification.h:265
Integer data array class.
Definition: DataArrays.h:52
PeakMassType
Peak mass type.
Definition: ProteinIdentification.h:249
FloatDataArray & getFloatDataArrayByName(String name)
Returns a mutable reference to the first float meta data array with the given name.
Definition: ProteinIdentification.h:210
const IntegerDataArray & getIntegerDataArrayByName(String name) const
Returns a const reference to the first integer meta data array with the given name.
Definition: ProteinIdentification.h:217
std::vector< String > accessions
Accessions of (indistinguishable) proteins that belong to the same group.
Definition: ProteinIdentification.h:136
Specificity
when querying for valid digestion products, this determines if the specificity of the two peptide end...
Definition: EnzymaticDigestion.h:67
std::vector< ProteinHit > protein_hits_
Definition: ProteinIdentification.h:499
static String concatenate(const std::vector< T > &container, const String &glue="")
Concatenates all elements of the container and puts the glue string between elements.
Definition: ListUtils.h:209
Protease digestion_enzyme
The cleavage site information in details (from ProteaseDB)
Definition: ProteinIdentification.h:275
EnzymaticDigestion::Specificity enzyme_term_specificity
The number of required cutting-rule matching termini during search (none=0, semi=1, or full=2)
Definition: ProteinIdentification.h:276
String protein_score_type_
Definition: ProteinIdentification.h:497
std::vector< IntegerDataArray > IntegerDataArrays
Definition: ProteinIdentification.h:130
double protein_significance_threshold_
Definition: ProteinIdentification.h:503
two way mapping from ms-run-path to protID|pepID-identifier
Definition: ProteinIdentification.h:80
String id_
Definition: ProteinIdentification.h:488
UInt missed_cleavages
The number of allowed missed cleavages.
Definition: ProteinIdentification.h:270
Interface for classes that can store arbitrary meta information (Type-Name-Value tuples).
Definition: MetaInfoInterface.h:60
std::vector< ProteinGroup > protein_groups_
Definition: ProteinIdentification.h:500
String search_engine_
Definition: ProteinIdentification.h:489
std::vector< String > StringList
Vector of String.
Definition: ListUtils.h:70
Representation of a protein hit.
Definition: ProteinHit.h:58
Invalid value exception.
Definition: Exception.h:327
bool higher_score_better_
Definition: ProteinIdentification.h:498
In-Memory representation of a mass spectrometry run.
Definition: MSExperiment.h:70
DateTime date_
Definition: ProteinIdentification.h:492
Definition: ProteinIdentification.h:251
bool operator!=(_Iterator< _Val, _Ref, _Ptr > const &, _Iterator< _Val, _Ref, _Ptr > const &)
Definition: KDTree.h:824
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:127
DateTime Class.
Definition: DateTime.h:58
const FloatDataArray & getFloatDataArrayByName(String name) const
Returns a const reference to the first float meta data array with the given name.
Definition: ProteinIdentification.h:231
bool fragment_mass_tolerance_ppm
Mass tolerance unit of fragment ions (true: ppm, false: Dalton)
Definition: ProteinIdentification.h:272
std::vector< ProteinGroup > indistinguishable_proteins_
Indistinguishable proteins: accessions[0] is "group leader", probability is meaningless.
Definition: ProteinIdentification.h:502
String data array class.
Definition: DataArrays.h:59
PeakMassType mass_type
Mass type of the peaks.
Definition: ProteinIdentification.h:267
Bundles multiple (e.g. indistinguishable) proteins in a group.
Definition: ProteinIdentification.h:119
SearchParameters search_parameters_
Definition: ProteinIdentification.h:491
OpenMS::DataArrays::StringDataArray StringDataArray
String data array vector type.
Definition: ProteinIdentification.h:126
OpenMS::DataArrays::IntegerDataArray IntegerDataArray
Integer data array vector type.
Definition: ProteinIdentification.h:129
String charges
The allowed charges for the search.
Definition: ProteinIdentification.h:266
double probability
Probability of this group.
Definition: ProteinIdentification.h:133
StringDataArrays string_data_arrays_
String data arrays.
Definition: ProteinIdentification.h:242
Not all required information provided.
Definition: Exception.h:186
Representation of a digestion enzyme for proteins (protease)
Definition: DigestionEnzymeProtein.h:48
std::vector< String > variable_modifications
Allowed variable modifications.
Definition: ProteinIdentification.h:269