OpenMS
ProteinIdentification.h
Go to the documentation of this file.
1 // Copyright (c) 2002-present, OpenMS Inc. -- EKU Tuebingen, ETH Zurich, and FU Berlin
2 // SPDX-License-Identifier: BSD-3-Clause
3 //
4 // --------------------------------------------------------------------------
5 // $Maintainer: Chris Bielow $
6 // $Authors: Nico Pfeifer, Chris Bielow $
7 // --------------------------------------------------------------------------
8 
9 #pragma once
10 
21 
22 #include <set>
23 
24 namespace OpenMS
25 {
26  class MSExperiment;
27  class PeptideIdentification;
28  class PeptideEvidence;
29  class ConsensusMap;
30 
50  class OPENMS_DLLAPI ProteinIdentification :
51  public MetaInfoInterface
52  {
53 public:
56 
58  struct Mapping
59  {
60  std::map<String, StringList> identifier_to_msrunpath;
61  std::map<StringList, String> runpath_to_identifier;
62 
63  Mapping() = default;
64 
65  explicit Mapping(const std::vector<ProteinIdentification>& prot_ids)
66  {
67  create(prot_ids);
68  }
69 
70  void create(const std::vector<ProteinIdentification>& prot_ids)
71  {
72  identifier_to_msrunpath.clear();
73  runpath_to_identifier.clear();
74  StringList filenames;
75  for (const ProteinIdentification& prot_id : prot_ids)
76  {
77  prot_id.getPrimaryMSRunPath(filenames);
78  if (filenames.empty())
79  {
80  throw Exception::MissingInformation(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "No MS run path annotated in ProteinIdentification.");
81  }
82  identifier_to_msrunpath[prot_id.getIdentifier()] = filenames;
83  const auto& it = runpath_to_identifier.find(filenames);
84  if (it != runpath_to_identifier.end())
85  {
86  throw Exception::InvalidValue(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION,
87  "Multiple protein identifications with the same ms-run-path in Consensus/FeatureXML. Check input!\n",
88  ListUtils::concatenate(filenames, ","));
89  }
90  runpath_to_identifier[filenames] = prot_id.getIdentifier();
91  }
92  }
93 
95  {
96  // if a merge index n is annotated, we use the filename annotated at index n in the protein identification, otherwise the one at index 0
97  size_t merge_index = pepid.getMetaValue(Constants::UserParam::ID_MERGE_INDEX, 0);
98  const auto& filenames = identifier_to_msrunpath.at(pepid.getIdentifier());
99  return (merge_index < filenames.size()) ? filenames[merge_index] : ""; // return filename or empty string if missing
100  }
101  };
102 
106  class OPENMS_DLLAPI ProteinGroup
107  {
108  public:
111  typedef std::vector<FloatDataArray> FloatDataArrays;
114  typedef std::vector<StringDataArray> StringDataArrays;
117  typedef std::vector<IntegerDataArray> IntegerDataArrays;
118 
120  double probability;
121 
123  std::vector<String> accessions;
124 
126 
128  bool operator==(const ProteinGroup& rhs) const;
129 
130  /**
131  @brief Comparison operator (for sorting)
132 
133  This operator is intended for sorting protein groups in a "best first"
134  manner. That means higher probabilities are "less" than lower
135  probabilities (!); smaller groups are "less" than larger groups;
136  everything else being equal, accessions are compared lexicographically.
137  */
138  bool operator<(const ProteinGroup& rhs) const;
139 
141 
154 
157  {
158  return float_data_arrays_;
159  }
160 
163 
166 
169 
172 
175 
178 
181 
184  {
185  return *std::find_if(integer_data_arrays_.begin(), integer_data_arrays_.end(),
186  [&name](const IntegerDataArray& da) { return da.getName() == name; } );
187  }
188 
191  {
192  return *std::find_if(string_data_arrays_.begin(), string_data_arrays_.end(),
193  [&name](const StringDataArray& da) { return da.getName() == name; } );
194  }
195 
198  {
199  return *std::find_if(float_data_arrays_.begin(), float_data_arrays_.end(),
200  [&name](const FloatDataArray& da) { return da.getName() == name; } );
201  }
202 
205  {
206  return *std::find_if(integer_data_arrays_.begin(), integer_data_arrays_.end(),
207  [&name](const IntegerDataArray& da) { return da.getName() == name; } );
208  }
209 
212  {
213  return *std::find_if(string_data_arrays_.begin(), string_data_arrays_.end(),
214  [&name](const StringDataArray& da) { return da.getName() == name; } );
215  }
216 
219  {
220  return *std::find_if(float_data_arrays_.begin(), float_data_arrays_.end(),
221  [&name](const FloatDataArray& da) { return da.getName() == name; } );
222  }
223 
224  private:
227 
230 
233  };
234 
237  {
240  SIZE_OF_PEAKMASSTYPE
241  };
242 
244  static const std::string NamesOfPeakMassType[SIZE_OF_PEAKMASSTYPE];
245 
248 
250  struct OPENMS_DLLAPI SearchParameters :
251  public MetaInfoInterface
252  {
258  std::vector<String> fixed_modifications;
259  std::vector<String> variable_modifications;
267 
274  ~SearchParameters() = default;
275 
280 
281  bool operator==(const SearchParameters& rhs) const;
282 
283  bool operator!=(const SearchParameters& rhs) const;
284 
286  std::pair<int,int> getChargeRange() const;
287 
292  bool mergeable(const ProteinIdentification::SearchParameters& sp, const String& experiment_type) const;
293 
294  private:
295  int getChargeValue_(String& charge_str) const;
296  };
297 
308 
313 
315  bool operator==(const ProteinIdentification& rhs) const;
317  bool operator!=(const ProteinIdentification& rhs) const;
319 
321 
322  const std::vector<ProteinHit>& getHits() const;
325  std::vector<ProteinHit>& getHits();
327  void insertHit(const ProteinHit& input);
329  void insertHit(ProteinHit&& input);
330 
336  void setHits(const std::vector<ProteinHit>& hits);
337 
339  std::vector<ProteinHit>::iterator findHit(const String& accession);
340 
342  const std::vector<ProteinGroup>& getProteinGroups() const;
344  std::vector<ProteinGroup>& getProteinGroups();
346  void insertProteinGroup(const ProteinGroup& group);
347 
349  const std::vector<ProteinGroup>& getIndistinguishableProteins() const;
351  std::vector<ProteinGroup>& getIndistinguishableProteins();
356 
358  double getSignificanceThreshold() const;
360  void setSignificanceThreshold(double value);
362  const String& getScoreType() const;
364  void setScoreType(const String& type);
366  bool isHigherScoreBetter() const;
368  void setHigherScoreBetter(bool higher_is_better);
370  void sort();
371 
380  void computeCoverage(const ConsensusMap& cmap, bool use_unassigned_ids);
382 
390  const PeptideIdentificationList& pep_ids,
391  const StringList& skip_modifications);
393  const ConsensusMap& cmap,
394  const StringList& skip_modifications,
395  bool use_unassigned_ids);
396 
397 
399 
400  const DateTime& getDateTime() const;
403  void setDateTime(const DateTime& date);
405  void setSearchEngine(const String& search_engine);
407  const String& getSearchEngine() const;
411  void setSearchEngineVersion(const String& search_engine_version);
415  void setInferenceEngine(const String& search_engine);
417  const String getInferenceEngine() const;
419  void setInferenceEngineVersion(const String& inference_engine_version);
423  void setSearchParameters(const SearchParameters& search_parameters);
425  void setSearchParameters(SearchParameters&& search_parameters);
431  const String& getIdentifier() const;
433  void setIdentifier(const String& id);
440  void setPrimaryMSRunPath(const StringList& s, bool raw = false);
441 
444  void addPrimaryMSRunPath(const String& s, bool raw = false);
445  void addPrimaryMSRunPath(const StringList& s, bool raw = false);
446 
453  void getPrimaryMSRunPath(StringList& output, bool raw = false) const;
454 
456  Size nrPrimaryMSRunPaths(bool raw = false) const;
457 
460  bool hasInferenceData() const;
461 
464 
468  bool peptideIDsMergeable(const ProteinIdentification& id_run, const String& experiment_type) const;
469 
472  std::vector<std::pair<String,String>> getSearchEngineSettingsAsPairs(const String& se = "") const;
473 
475 
478 protected:
480 
487 
489 
492  std::vector<ProteinHit> protein_hits_;
493  std::vector<ProteinGroup> protein_groups_;
495  std::vector<ProteinGroup> indistinguishable_proteins_;
498 
499  private:
500  void computeCoverageFromEvidenceMapping_(const std::unordered_map<String, std::set<PeptideEvidence>>& map);
501  void fillEvidenceMapping_(std::unordered_map<String, std::set<PeptideEvidence> >& map_acc_2_evidence,
502  const PeptideIdentificationList& pep_ids) const;
503 
504  void fillModMapping_(const PeptideIdentificationList& pep_ids, const StringList& skip_modifications,
505  std::unordered_map<String, std::set<std::pair<Size, ResidueModification>>>& prot2mod) const;
506  };
507 
508 
509 } //namespace OpenMS
A container for consensus elements.
Definition: ConsensusMap.h:68
Float data array class.
Definition: DataArrays.h:25
Integer data array class.
Definition: DataArrays.h:75
String data array class.
Definition: DataArrays.h:125
DateTime Class.
Definition: DateTime.h:33
Representation of a digestion enzyme for proteins (protease)
Definition: DigestionEnzymeProtein.h:24
Specificity
when querying for valid digestion products, this determines if the specificity of the two peptide end...
Definition: EnzymaticDigestion.h:42
Invalid value exception.
Definition: Exception.h:305
Not all required information provided.
Definition: Exception.h:155
static String concatenate(const std::vector< T > &container, const String &glue="")
Concatenates all elements of the container and puts the glue string between elements.
Definition: ListUtils.h:184
In-Memory representation of a mass spectrometry run.
Definition: MSExperiment.h:49
Interface for classes that can store arbitrary meta information (Type-Name-Value tuples).
Definition: MetaInfoInterface.h:35
const DataValue & getMetaValue(const String &name) const
Returns the value corresponding to a string, or DataValue::EMPTY if not found.
Container for peptide identifications from multiple spectra.
Definition: PeptideIdentificationList.h:66
Represents the set of candidates (SpectrumMatches) identified for a single precursor spectrum.
Definition: PeptideIdentification.h:63
const String & getIdentifier() const
Returns the identifier which links this PI to its corresponding ProteinIdentification.
Representation of a protein hit.
Definition: ProteinHit.h:34
Bundles multiple (e.g. indistinguishable) proteins in a group.
Definition: ProteinIdentification.h:107
void setIntegerDataArrays(const IntegerDataArrays &ida)
Sets the integer meta data arrays.
FloatDataArrays & getFloatDataArrays()
Returns a mutable reference to the float meta data arrays.
Definition: ProteinIdentification.h:156
double probability
Probability of this group.
Definition: ProteinIdentification.h:120
bool operator<(const ProteinGroup &rhs) const
IntegerDataArray & getIntegerDataArrayByName(String name)
Returns a mutable reference to the first integer meta data array with the given name.
Definition: ProteinIdentification.h:183
std::vector< StringDataArray > StringDataArrays
Definition: ProteinIdentification.h:114
OpenMS::DataArrays::FloatDataArray FloatDataArray
Float data array type.
Definition: ProteinIdentification.h:110
StringDataArrays & getStringDataArrays()
Returns a mutable reference to the string meta data arrays.
const IntegerDataArray & getIntegerDataArrayByName(String name) const
Returns a const reference to the first integer meta data array with the given name.
Definition: ProteinIdentification.h:204
std::vector< String > accessions
Accessions of (indistinguishable) proteins that belong to the same group.
Definition: ProteinIdentification.h:123
FloatDataArray & getFloatDataArrayByName(String name)
Returns a mutable reference to the first float meta data array with the given name.
Definition: ProteinIdentification.h:197
StringDataArrays string_data_arrays_
String data arrays.
Definition: ProteinIdentification.h:229
StringDataArray & getStringDataArrayByName(String name)
Returns a mutable reference to the first string meta data array with the given name.
Definition: ProteinIdentification.h:190
const StringDataArray & getStringDataArrayByName(String name) const
Returns a const reference to the first string meta data array with the given name.
Definition: ProteinIdentification.h:211
const IntegerDataArrays & getIntegerDataArrays() const
Returns a const reference to the integer meta data arrays.
IntegerDataArrays integer_data_arrays_
Integer data arrays.
Definition: ProteinIdentification.h:232
OpenMS::DataArrays::StringDataArray StringDataArray
String data array type.
Definition: ProteinIdentification.h:113
FloatDataArrays float_data_arrays_
Float data arrays.
Definition: ProteinIdentification.h:226
const FloatDataArray & getFloatDataArrayByName(String name) const
Returns a const reference to the first float meta data array with the given name.
Definition: ProteinIdentification.h:218
const FloatDataArrays & getFloatDataArrays() const
Returns a const reference to the float meta data arrays.
void setStringDataArrays(const StringDataArrays &sda)
Sets the string meta data arrays.
const StringDataArrays & getStringDataArrays() const
Returns a const reference to the string meta data arrays.
std::vector< FloatDataArray > FloatDataArrays
Definition: ProteinIdentification.h:111
OpenMS::DataArrays::IntegerDataArray IntegerDataArray
Integer data array type.
Definition: ProteinIdentification.h:116
bool operator==(const ProteinGroup &rhs) const
Equality operator.
std::vector< IntegerDataArray > IntegerDataArrays
Definition: ProteinIdentification.h:117
IntegerDataArrays & getIntegerDataArrays()
Returns a mutable reference to the integer meta data arrays.
void setFloatDataArrays(const FloatDataArrays &fda)
Sets the float meta data arrays.
Representation of a protein identification run.
Definition: ProteinIdentification.h:52
void setIdentifier(const String &id)
Sets the identifier.
ProteinIdentification(const ProteinIdentification &)=default
Copy constructor.
const String & getIdentifier() const
Returns the identifier.
void insertProteinGroup(const ProteinGroup &group)
Appends a new protein group.
void insertHit(const ProteinHit &input)
Appends a protein hit.
const String getInferenceEngineVersion() const
Returns the inference engine version.
void setSearchEngine(const String &search_engine)
Sets the search engine type.
void addPrimaryMSRunPath(const String &s, bool raw=false)
SearchParameters & getSearchParameters()
Returns the search parameters (mutable)
const String getOriginalSearchEngineName() const
Return the type of search engine that was first applied (e.g., before percolator or consensusID) or "...
bool hasInferenceEngineAsSearchEngine() const
Checks if the search engine name matches an inference engine known to OpenMS.
void computeCoverageFromEvidenceMapping_(const std::unordered_map< String, std::set< PeptideEvidence >> &map)
void setHigherScoreBetter(bool higher_is_better)
Sets the orientation of the score (is higher better?)
const std::vector< ProteinGroup > & getIndistinguishableProteins() const
Returns the indistinguishable proteins.
Size nrPrimaryMSRunPaths(bool raw=false) const
Returns the number of primary MS runs involved in this ID run.
const String & getSearchEngineVersion() const
Returns the search engine version.
double getSignificanceThreshold() const
Returns the protein significance threshold value.
const String getInferenceEngine() const
Returns the type of inference engine used.
void computeModifications(const PeptideIdentificationList &pep_ids, const StringList &skip_modifications)
Compute the modifications of all ProteinHits given PeptideHits.
void sort()
Sorts the protein hits according to their score.
void insertIndistinguishableProteins(const ProteinGroup &group)
Appends new indistinguishable proteins.
String search_engine_
Definition: ProteinIdentification.h:482
std::vector< std::pair< String, String > > getSearchEngineSettingsAsPairs(const String &se="") const
PeakMassType
Peak mass type.
Definition: ProteinIdentification.h:237
@ AVERAGE
Definition: ProteinIdentification.h:239
@ MONOISOTOPIC
Definition: ProteinIdentification.h:238
ProteinIdentification()
Default constructor.
const std::vector< ProteinGroup > & getProteinGroups() const
Returns the protein groups.
void setSignificanceThreshold(double value)
Sets the protein significance threshold value.
std::vector< ProteinHit > protein_hits_
Definition: ProteinIdentification.h:492
const SearchParameters & getSearchParameters() const
Returns the search parameters.
void setInferenceEngineVersion(const String &inference_engine_version)
Sets the inference engine version.
bool operator!=(const ProteinIdentification &rhs) const
Inequality operator.
ProteinHit HitType
Hit type definition.
Definition: ProteinIdentification.h:55
String search_engine_version_
Definition: ProteinIdentification.h:483
void setSearchEngineVersion(const String &search_engine_version)
Sets the search engine version.
ProteinIdentification & operator=(const ProteinIdentification &)=default
Assignment operator.
void setHits(const std::vector< ProteinHit > &hits)
Sets the protein hits.
void computeCoverage(const ConsensusMap &cmap, bool use_unassigned_ids)
void getPrimaryMSRunPath(StringList &output, bool raw=false) const
double protein_significance_threshold_
Definition: ProteinIdentification.h:496
SearchParameters search_parameters_
Definition: ProteinIdentification.h:484
void fillIndistinguishableGroupsWithSingletons()
Appends singleton groups (with the current score) for every yet ungrouped protein hit.
void setScoreType(const String &type)
Sets the protein score type.
String protein_score_type_
Definition: ProteinIdentification.h:490
static StringList getAllNamesOfPeakMassType()
Returns all peak mass type names known to OpenMS.
bool higher_score_better_
Definition: ProteinIdentification.h:491
void fillEvidenceMapping_(std::unordered_map< String, std::set< PeptideEvidence > > &map_acc_2_evidence, const PeptideIdentificationList &pep_ids) const
std::vector< ProteinGroup > & getIndistinguishableProteins()
Returns the indistinguishable proteins (mutable)
void setPrimaryMSRunPath(const StringList &s, bool raw=false)
void computeModifications(const ConsensusMap &cmap, const StringList &skip_modifications, bool use_unassigned_ids)
DateTime date_
Definition: ProteinIdentification.h:485
std::vector< ProteinHit >::iterator findHit(const String &accession)
Finds a protein hit by accession (returns past-the-end iterator if not found)
void setInferenceEngine(const String &search_engine)
Sets the inference engine type.
void copyMetaDataOnly(const ProteinIdentification &)
Copies only metadata (no protein hits or protein groups)
std::vector< ProteinHit > & getHits()
Returns the protein hits (mutable)
bool peptideIDsMergeable(const ProteinIdentification &id_run, const String &experiment_type) const
bool isHigherScoreBetter() const
Returns true if a higher score represents a better score.
void computeCoverage(const PeptideIdentificationList &pep_ids)
Compute the coverage (in percent) of all ProteinHits given PeptideHits.
ProteinIdentification(ProteinIdentification &&)=default
Move constructor.
std::vector< ProteinGroup > indistinguishable_proteins_
Indistinguishable proteins: accessions[0] is "group leader", probability is meaningless.
Definition: ProteinIdentification.h:495
ProteinIdentification & operator=(ProteinIdentification &&)=default
Move assignment operator.
std::vector< ProteinGroup > protein_groups_
Definition: ProteinIdentification.h:493
virtual ~ProteinIdentification()
Destructor.
void setDateTime(const DateTime &date)
Sets the date of the protein identification run.
void setSearchParameters(SearchParameters &&search_parameters)
Sets the search parameters (move)
String id_
Definition: ProteinIdentification.h:481
bool operator==(const ProteinIdentification &rhs) const
Equality operator.
void insertHit(ProteinHit &&input)
Appends a protein hit.
const String & getSearchEngine() const
Returns the type of search engine used.
void fillModMapping_(const PeptideIdentificationList &pep_ids, const StringList &skip_modifications, std::unordered_map< String, std::set< std::pair< Size, ResidueModification >>> &prot2mod) const
const String & getScoreType() const
Returns the protein score type.
void setPrimaryMSRunPath(const StringList &s, MSExperiment &e)
Sets the file path to the primary MS run, but tries to use the mzML annotated in the MSExperiment.
void addPrimaryMSRunPath(const StringList &s, bool raw=false)
void setSearchParameters(const SearchParameters &search_parameters)
Sets the search parameters.
std::vector< ProteinGroup > & getProteinGroups()
Returns the protein groups (mutable)
A more convenient string class.
Definition: String.h:34
unsigned int UInt
Unsigned integer type.
Definition: Types.h:64
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:97
std::vector< String > StringList
Vector of String.
Definition: ListUtils.h:44
const std::string ID_MERGE_INDEX
Definition: Constants.h:347
Main OpenMS namespace.
Definition: openswathalgo/include/OpenMS/OPENSWATHALGO/DATAACCESS/ISpectrumAccess.h:19
two way mapping from ms-run-path to protID|pepID-identifier
Definition: ProteinIdentification.h:59
void create(const std::vector< ProteinIdentification > &prot_ids)
Definition: ProteinIdentification.h:70
std::map< StringList, String > runpath_to_identifier
Definition: ProteinIdentification.h:61
Mapping(const std::vector< ProteinIdentification > &prot_ids)
Definition: ProteinIdentification.h:65
String getPrimaryMSRunPath(const PeptideIdentification &pepid) const
Definition: ProteinIdentification.h:94
std::map< String, StringList > identifier_to_msrunpath
Definition: ProteinIdentification.h:60
Search parameters of the DB search.
Definition: ProteinIdentification.h:252
String db_version
The database version.
Definition: ProteinIdentification.h:254
std::pair< int, int > getChargeRange() const
Returns the charge range from the search engine settings as a pair of ints.
bool operator!=(const SearchParameters &rhs) const
bool fragment_mass_tolerance_ppm
Mass tolerance unit of fragment ions (true: ppm, false: Dalton)
Definition: ProteinIdentification.h:262
EnzymaticDigestion::Specificity enzyme_term_specificity
The number of required cutting-rule matching termini during search (none=0, semi=1,...
Definition: ProteinIdentification.h:266
String taxonomy
The taxonomy restriction.
Definition: ProteinIdentification.h:255
std::vector< String > fixed_modifications
Used fixed modifications.
Definition: ProteinIdentification.h:258
SearchParameters & operator=(const SearchParameters &)=default
Assignment operator.
SearchParameters(const SearchParameters &)=default
Copy constructor.
String charges
The allowed charges for the search.
Definition: ProteinIdentification.h:256
Protease digestion_enzyme
The cleavage site information in details (from ProteaseDB)
Definition: ProteinIdentification.h:265
bool operator==(const SearchParameters &rhs) const
bool mergeable(const ProteinIdentification::SearchParameters &sp, const String &experiment_type) const
double fragment_mass_tolerance
Mass tolerance of fragment ions (Dalton or ppm)
Definition: ProteinIdentification.h:261
bool precursor_mass_tolerance_ppm
Mass tolerance unit of precursor ions (true: ppm, false: Dalton)
Definition: ProteinIdentification.h:264
SearchParameters(SearchParameters &&)=default
Move constructor.
double precursor_mass_tolerance
Mass tolerance of precursor ions (Dalton or ppm)
Definition: ProteinIdentification.h:263
SearchParameters & operator=(SearchParameters &&) &=default
Move assignment operator.
std::vector< String > variable_modifications
Allowed variable modifications.
Definition: ProteinIdentification.h:259
PeakMassType mass_type
Mass type of the peaks.
Definition: ProteinIdentification.h:257
String db
The used database.
Definition: ProteinIdentification.h:253
int getChargeValue_(String &charge_str) const
UInt missed_cleavages
The number of allowed missed cleavages.
Definition: ProteinIdentification.h:260