OpenMS  2.8.0
ProteinIdentification.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2021.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Chris Bielow $
32 // $Authors: Nico Pfeifer, Chris Bielow $
33 // --------------------------------------------------------------------------
34 
35 #pragma once
36 
43 
44 #include <set>
45 
46 namespace OpenMS
47 {
48  class MSExperiment;
49  class PeptideIdentification;
50 
70  class OPENMS_DLLAPI ProteinIdentification :
71  public MetaInfoInterface
72  {
73 public:
76 
78  struct Mapping
79  {
80  std::map<String, StringList> identifier_to_msrunpath;
81  std::map<StringList, String> runpath_to_identifier;
82 
83  Mapping() = default;
84 
85  explicit Mapping(const std::vector<ProteinIdentification>& prot_ids)
86  {
87  create(prot_ids);
88  }
89  void create(const std::vector<ProteinIdentification>& prot_ids)
90  {
91  identifier_to_msrunpath.clear();
92  runpath_to_identifier.clear();
93  StringList filenames;
94  for (const ProteinIdentification& prot_id : prot_ids)
95  {
96  prot_id.getPrimaryMSRunPath(filenames);
97  if (filenames.empty())
98  {
99  throw Exception::MissingInformation(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "No MS run path annotated in ProteinIdentification.");
100  }
101  identifier_to_msrunpath[prot_id.getIdentifier()] = filenames;
102  const auto& it = runpath_to_identifier.find(filenames);
103  if (it != runpath_to_identifier.end())
104  {
105  throw Exception::InvalidValue(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION,
106  "Multiple protein identifications with the same ms-run-path in Consensus/FeatureXML. Check input!\n",
107  ListUtils::concatenate(filenames, ","));
108  }
109  runpath_to_identifier[filenames] = prot_id.getIdentifier();
110  }
111  }
112  };
113 
117  class OPENMS_DLLAPI ProteinGroup
118  {
119  public:
122  typedef std::vector<FloatDataArray> FloatDataArrays; ///< Vector of float data arrays
125  typedef std::vector<StringDataArray> StringDataArrays; ///< Vector of string data arrays
128  typedef std::vector<IntegerDataArray> IntegerDataArrays; ///< Vector of integer data arrays
129 
131  double probability; ///< Probability of this group
132 
134  std::vector<String> accessions; ///< Accessions of (indistinguishable) proteins that belong to the same group
135 
137 
139  bool operator==(const ProteinGroup& rhs) const; ///< Equality operator
140 
141  /**
142  @brief Comparison operator (for sorting)
143 
144  This operator is intended for sorting protein groups in a "best first"
145  manner. That means higher probabilities are "less" than lower
146  probabilities (!); smaller groups are "less" than larger groups;
147  everything else being equal, accessions are compared lexicographically.
148  */
149  bool operator<(const ProteinGroup& rhs) const;
150 
152 
165 
168  {
169  return float_data_arrays_;
170  }
171 
174 
177 
180 
183 
186 
189 
192 
195  {
196  return *std::find_if(integer_data_arrays_.begin(), integer_data_arrays_.end(),
197  [&name](const IntegerDataArray& da) { return da.getName() == name; } );
198  }
199 
202  {
203  return *std::find_if(string_data_arrays_.begin(), string_data_arrays_.end(),
204  [&name](const StringDataArray& da) { return da.getName() == name; } );
205  }
206 
209  {
210  return *std::find_if(float_data_arrays_.begin(), float_data_arrays_.end(),
211  [&name](const FloatDataArray& da) { return da.getName() == name; } );
212  }
213 
216  {
217  return *std::find_if(integer_data_arrays_.begin(), integer_data_arrays_.end(),
218  [&name](const IntegerDataArray& da) { return da.getName() == name; } );
219  }
220 
223  {
224  return *std::find_if(string_data_arrays_.begin(), string_data_arrays_.end(),
225  [&name](const StringDataArray& da) { return da.getName() == name; } );
226  }
227 
230  {
231  return *std::find_if(float_data_arrays_.begin(), float_data_arrays_.end(),
232  [&name](const FloatDataArray& da) { return da.getName() == name; } );
233  }
234 
235  private:
238 
241 
244  };
245 
248  {
251  SIZE_OF_PEAKMASSTYPE
252  };
253 
255  static const std::string NamesOfPeakMassType[SIZE_OF_PEAKMASSTYPE];
256 
258  struct OPENMS_DLLAPI SearchParameters : // Search parameters of the DB search
259  public MetaInfoInterface
260  {
266  std::vector<String> fixed_modifications; ///< Used fixed modifications
267  std::vector<String> variable_modifications; ///< Allowed variable modifications
275 
282  ~SearchParameters() = default; ///< Destructor (compiler-generated)
283 
288 
289  bool operator==(const SearchParameters& rhs) const; ///< Equality operator
290 
291  bool operator!=(const SearchParameters& rhs) const; ///< Inequality operator
292 
294  std::pair<int,int> getChargeRange() const; ///< Returns the charge range from the search engine settings as a pair of ints
295 
300  bool mergeable(const ProteinIdentification::SearchParameters& sp, const String& experiment_type) const; // NOTE(review): definition not visible here; presumably reports whether sp may be merged with these parameters for the given experiment type - confirm
301 
302  private:
303  int getChargeValue_(String& charge_str) const; // NOTE(review): takes a mutable reference; whether charge_str is modified is not visible here - confirm
304  };
305 
316 
321 
323  bool operator==(const ProteinIdentification& rhs) const; ///< Equality operator
325  bool operator!=(const ProteinIdentification& rhs) const; ///< Inequality operator
327 
329 
330  const std::vector<ProteinHit>& getHits() const; ///< Returns the protein hits
333  std::vector<ProteinHit>& getHits(); ///< Returns the protein hits (mutable)
335  void insertHit(const ProteinHit& input); ///< Appends a protein hit
337  void insertHit(ProteinHit&& input); ///< Appends a protein hit (rvalue overload)
338 
344  void setHits(const std::vector<ProteinHit>& hits); ///< Sets the protein hits
345 
347  std::vector<ProteinHit>::iterator findHit(const String& accession); ///< Finds a protein hit by accession (returns past-the-end iterator if not found)
348 
350  const std::vector<ProteinGroup>& getProteinGroups() const; ///< Returns the protein groups
352  std::vector<ProteinGroup>& getProteinGroups(); ///< Returns the protein groups (mutable)
354  void insertProteinGroup(const ProteinGroup& group); ///< Appends a new protein group
355 
357  const std::vector<ProteinGroup>& getIndistinguishableProteins() const; ///< Returns the indistinguishable proteins
359  std::vector<ProteinGroup>& getIndistinguishableProteins(); ///< Returns the indistinguishable proteins (mutable)
364 
366  double getSignificanceThreshold() const; ///< Returns the protein significance threshold value
368  void setSignificanceThreshold(double value); ///< Sets the protein significance threshold value
370  const String& getScoreType() const; ///< Returns the protein score type
372  void setScoreType(const String& type); ///< Sets the protein score type
374  bool isHigherScoreBetter() const; ///< Returns true if a higher score represents a better score
376  void setHigherScoreBetter(bool higher_is_better); ///< Sets the orientation of the score (is higher better?)
378  void sort(); ///< Sorts the protein hits according to their score
380  void assignRanks(); ///< Sorts the protein hits by score and assigns ranks (best score has rank 1)
388  void computeCoverage(const std::vector<PeptideIdentification>& pep_ids); ///< Computes the coverage (in percent) of all ProteinHits given PeptideHits
390 
398  const std::vector<PeptideIdentification>& pep_ids,
399  const StringList& skip_modifications);
400 
401 
403 
404  const DateTime& getDateTime() const; ///< Presumably returns the date of the protein identification run (counterpart of setDateTime) - definition not visible here
407  void setDateTime(const DateTime& date); ///< Sets the date of the protein identification run
409  void setSearchEngine(const String& search_engine); ///< Sets the search engine type
411  const String& getSearchEngine() const; ///< Returns the type of search engine used
415  void setSearchEngineVersion(const String& search_engine_version); ///< Sets the search engine version
419  void setInferenceEngine(const String& search_engine); ///< Sets the inference engine type
421  const String getInferenceEngine() const; ///< Returns the inference engine type; note the String is returned by value, unlike getSearchEngine()
423  void setInferenceEngineVersion(const String& inference_engine_version); ///< Sets the inference engine version
427  void setSearchParameters(const SearchParameters& search_parameters); ///< Sets the search parameters
429  void setSearchParameters(SearchParameters&& search_parameters); ///< Sets the search parameters (move)
435  const String& getIdentifier() const; ///< Returns the identifier
437  void setIdentifier(const String& id); ///< Sets the identifier
443  void setPrimaryMSRunPath(const StringList& s, bool raw = false); // NOTE(review): meaning of 'raw' is not visible in this listing - confirm against the implementation
444 
447  void addPrimaryMSRunPath(const String& s, bool raw = false); // single-path overload
448  void addPrimaryMSRunPath(const StringList& s, bool raw = false); // list overload
449 
455  void getPrimaryMSRunPath(StringList& output, bool raw = false) const; // delivers the primary MS run path(s) through 'output'; Mapping::create() treats an empty result as missing annotation
456 
458  Size nrPrimaryMSRunPaths(bool raw = false) const; ///< Gets the number of primary MS runs involved in this ID run
459 
462  bool hasInferenceData() const;
463 
466 
470  bool peptideIDsMergeable(const ProteinIdentification& id_run, const String& experiment_type) const;
471 
474  std::vector<std::pair<String,String>> getSearchEngineSettingsAsPairs(const String& se = "") const;
475 
477 
478 protected:
480 
487 
489 
492  std::vector<ProteinHit> protein_hits_; ///< Protein hits of this run
493  std::vector<ProteinGroup> protein_groups_; ///< Protein groups of this run
495  std::vector<ProteinGroup> indistinguishable_proteins_; ///< Indistinguishable proteins: accessions[0] is "group leader", probability is meaningless
498  };
499 
500 } //namespace OpenMS
Float data array class.
Definition: DataArrays.h:48
Integer data array class.
Definition: DataArrays.h:55
String data array class.
Definition: DataArrays.h:62
DateTime Class.
Definition: DateTime.h:55
Representation of a digestion enzyme for proteins (protease)
Definition: DigestionEnzymeProtein.h:50
Specificity
when querying for valid digestion products, this determines if the specificity of the two peptide end...
Definition: EnzymaticDigestion.h:69
Invalid value exception.
Definition: Exception.h:329
Not all required information provided.
Definition: Exception.h:188
static String concatenate(const std::vector< T > &container, const String &glue="")
Concatenates all elements of the container and puts the glue string between elements.
Definition: ListUtils.h:209
In-Memory representation of a mass spectrometry run.
Definition: MSExperiment.h:73
Interface for classes that can store arbitrary meta information (Type-Name-Value tuples).
Definition: MetaInfoInterface.h:61
Representation of a protein hit.
Definition: ProteinHit.h:60
Bundles multiple (e.g. indistinguishable) proteins in a group.
Definition: ProteinIdentification.h:118
void setIntegerDataArrays(const IntegerDataArrays &ida)
Sets the integer meta data arrays.
FloatDataArrays & getFloatDataArrays()
Returns a mutable reference to the float meta data arrays.
Definition: ProteinIdentification.h:167
double probability
Probability of this group.
Definition: ProteinIdentification.h:131
bool operator<(const ProteinGroup &rhs) const
IntegerDataArray & getIntegerDataArrayByName(String name)
Returns a mutable reference to the first integer meta data array with the given name.
Definition: ProteinIdentification.h:194
std::vector< StringDataArray > StringDataArrays
Definition: ProteinIdentification.h:125
OpenMS::DataArrays::FloatDataArray FloatDataArray
Float data array vector type.
Definition: ProteinIdentification.h:121
StringDataArrays & getStringDataArrays()
Returns a mutable reference to the string meta data arrays.
const IntegerDataArray & getIntegerDataArrayByName(String name) const
Returns a const reference to the first integer meta data array with the given name.
Definition: ProteinIdentification.h:215
std::vector< String > accessions
Accessions of (indistinguishable) proteins that belong to the same group.
Definition: ProteinIdentification.h:134
FloatDataArray & getFloatDataArrayByName(String name)
Returns a mutable reference to the first float meta data array with the given name.
Definition: ProteinIdentification.h:208
StringDataArrays string_data_arrays_
String data arrays.
Definition: ProteinIdentification.h:240
StringDataArray & getStringDataArrayByName(String name)
Returns a mutable reference to the first string meta data array with the given name.
Definition: ProteinIdentification.h:201
const StringDataArray & getStringDataArrayByName(String name) const
Returns a const reference to the first string meta data array with the given name.
Definition: ProteinIdentification.h:222
const IntegerDataArrays & getIntegerDataArrays() const
Returns a const reference to the integer meta data arrays.
IntegerDataArrays integer_data_arrays_
Integer data arrays.
Definition: ProteinIdentification.h:243
OpenMS::DataArrays::StringDataArray StringDataArray
String data array vector type.
Definition: ProteinIdentification.h:124
FloatDataArrays float_data_arrays_
Float data arrays.
Definition: ProteinIdentification.h:237
const FloatDataArray & getFloatDataArrayByName(String name) const
Returns a const reference to the first float meta data array with the given name.
Definition: ProteinIdentification.h:229
const FloatDataArrays & getFloatDataArrays() const
Returns a const reference to the float meta data arrays.
void setStringDataArrays(const StringDataArrays &sda)
Sets the string meta data arrays.
const StringDataArrays & getStringDataArrays() const
Returns a const reference to the string meta data arrays.
std::vector< FloatDataArray > FloatDataArrays
Definition: ProteinIdentification.h:122
OpenMS::DataArrays::IntegerDataArray IntegerDataArray
Integer data array vector type.
Definition: ProteinIdentification.h:127
bool operator==(const ProteinGroup &rhs) const
Equality operator.
std::vector< IntegerDataArray > IntegerDataArrays
Definition: ProteinIdentification.h:128
IntegerDataArrays & getIntegerDataArrays()
Returns a mutable reference to the integer meta data arrays.
void setFloatDataArrays(const FloatDataArrays &fda)
Sets the float meta data arrays.
Representation of a protein identification run.
Definition: ProteinIdentification.h:72
void setIdentifier(const String &id)
Sets the identifier.
void computeModifications(const std::vector< PeptideIdentification > &pep_ids, const StringList &skip_modifications)
Compute the modifications of all ProteinHits given PeptideHits.
ProteinIdentification(const ProteinIdentification &)=default
Copy constructor.
const String & getIdentifier() const
Returns the identifier.
void insertProteinGroup(const ProteinGroup &group)
Appends a new protein group.
void insertHit(const ProteinHit &input)
Appends a protein hit.
const String getInferenceEngineVersion() const
Returns the inference engine version.
void setSearchEngine(const String &search_engine)
Sets the search engine type.
void addPrimaryMSRunPath(const String &s, bool raw=false)
SearchParameters & getSearchParameters()
Returns the search parameters (mutable)
const String getOriginalSearchEngineName() const
Return the type of search engine that was first applied (e.g., before percolator or consensusID) or "...
bool hasInferenceEngineAsSearchEngine() const
Checks if the search engine name matches an inference engine known to OpenMS.
void setHigherScoreBetter(bool higher_is_better)
Sets the orientation of the score (is higher better?)
const std::vector< ProteinGroup > & getIndistinguishableProteins() const
Returns the indistinguishable proteins.
Size nrPrimaryMSRunPaths(bool raw=false) const
Gets the number of primary MS runs involved in this ID run.
const String & getSearchEngineVersion() const
Returns the search engine version.
double getSignificanceThreshold() const
Returns the protein significance threshold value.
const String getInferenceEngine() const
Returns the type of inference engine used.
void sort()
Sorts the protein hits according to their score.
void insertIndistinguishableProteins(const ProteinGroup &group)
Appends new indistinguishable proteins.
String search_engine_
Definition: ProteinIdentification.h:482
std::vector< std::pair< String, String > > getSearchEngineSettingsAsPairs(const String &se="") const
PeakMassType
Peak mass type.
Definition: ProteinIdentification.h:248
@ AVERAGE
Definition: ProteinIdentification.h:250
@ MONOISOTOPIC
Definition: ProteinIdentification.h:249
ProteinIdentification()
Default constructor.
const std::vector< ProteinGroup > & getProteinGroups() const
Returns the protein groups.
void setSignificanceThreshold(double value)
Sets the protein significance threshold value.
std::vector< ProteinHit > protein_hits_
Definition: ProteinIdentification.h:492
const SearchParameters & getSearchParameters() const
Returns the search parameters.
void setInferenceEngineVersion(const String &inference_engine_version)
Sets the inference engine version.
bool operator!=(const ProteinIdentification &rhs) const
Inequality operator.
ProteinHit HitType
Hit type definition.
Definition: ProteinIdentification.h:75
String search_engine_version_
Definition: ProteinIdentification.h:483
void setSearchEngineVersion(const String &search_engine_version)
Sets the search engine version.
ProteinIdentification & operator=(const ProteinIdentification &)=default
Assignment operator.
void setHits(const std::vector< ProteinHit > &hits)
Sets the protein hits.
void getPrimaryMSRunPath(StringList &output, bool raw=false) const
double protein_significance_threshold_
Definition: ProteinIdentification.h:496
SearchParameters search_parameters_
Definition: ProteinIdentification.h:484
void fillIndistinguishableGroupsWithSingletons()
Appends singleton groups (with the current score) for every yet ungrouped protein hit.
void setScoreType(const String &type)
Sets the protein score type.
String protein_score_type_
Definition: ProteinIdentification.h:490
bool higher_score_better_
Definition: ProteinIdentification.h:491
std::vector< ProteinGroup > & getIndistinguishableProteins()
Returns the indistinguishable proteins (mutable)
void setPrimaryMSRunPath(const StringList &s, bool raw=false)
void assignRanks()
Sorts the protein hits by score and assigns ranks (best score has rank 1)
DateTime date_
Definition: ProteinIdentification.h:485
std::vector< ProteinHit >::iterator findHit(const String &accession)
Finds a protein hit by accession (returns past-the-end iterator if not found)
void setInferenceEngine(const String &search_engine)
Sets the inference engine type.
void computeCoverage(const std::vector< PeptideIdentification > &pep_ids)
Compute the coverage (in percent) of all ProteinHits given PeptideHits.
std::vector< ProteinHit > & getHits()
Returns the protein hits (mutable)
bool peptideIDsMergeable(const ProteinIdentification &id_run, const String &experiment_type) const
bool isHigherScoreBetter() const
Returns true if a higher score represents a better score.
ProteinIdentification(ProteinIdentification &&)=default
Move constructor.
std::vector< ProteinGroup > indistinguishable_proteins_
Indistinguishable proteins: accessions[0] is "group leader", probability is meaningless.
Definition: ProteinIdentification.h:495
ProteinIdentification & operator=(ProteinIdentification &&)=default
Move assignment operator.
std::vector< ProteinGroup > protein_groups_
Definition: ProteinIdentification.h:493
virtual ~ProteinIdentification()
Destructor.
void setDateTime(const DateTime &date)
Sets the date of the protein identification run.
void setSearchParameters(SearchParameters &&search_parameters)
Sets the search parameters (move)
String id_
Definition: ProteinIdentification.h:481
bool operator==(const ProteinIdentification &rhs) const
Equality operator.
void insertHit(ProteinHit &&input)
Appends a protein hit.
const String & getSearchEngine() const
Returns the type of search engine used.
const String & getScoreType() const
Returns the protein score type.
void setPrimaryMSRunPath(const StringList &s, MSExperiment &e)
set the file path to the primary MS run but try to use the mzML annotated in the MSExperiment.
void addPrimaryMSRunPath(const StringList &s, bool raw=false)
void setSearchParameters(const SearchParameters &search_parameters)
Sets the search parameters.
std::vector< ProteinGroup > & getProteinGroups()
Returns the protein groups (mutable)
A more convenient string class.
Definition: String.h:60
unsigned int UInt
Unsigned integer type.
Definition: Types.h:94
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:127
std::vector< String > StringList
Vector of String.
Definition: ListUtils.h:70
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:47
Two-way mapping between ms-run-path and protID|pepID identifier.
Definition: ProteinIdentification.h:79
void create(const std::vector< ProteinIdentification > &prot_ids)
Definition: ProteinIdentification.h:89
std::map< StringList, String > runpath_to_identifier
Definition: ProteinIdentification.h:81
Mapping(const std::vector< ProteinIdentification > &prot_ids)
Definition: ProteinIdentification.h:85
std::map< String, StringList > identifier_to_msrunpath
Definition: ProteinIdentification.h:80
Search parameters of the DB search.
Definition: ProteinIdentification.h:260
String db_version
The database version.
Definition: ProteinIdentification.h:262
std::pair< int, int > getChargeRange() const
returns the charge range from the search engine settings as a pair of ints
bool operator!=(const SearchParameters &rhs) const
bool fragment_mass_tolerance_ppm
Mass tolerance unit of fragment ions (true: ppm, false: Dalton)
Definition: ProteinIdentification.h:270
EnzymaticDigestion::Specificity enzyme_term_specificity
The number of required cutting-rule matching termini during search (none=0, semi=1,...
Definition: ProteinIdentification.h:274
String taxonomy
The taxonomy restriction.
Definition: ProteinIdentification.h:263
std::vector< String > fixed_modifications
Used fixed modifications.
Definition: ProteinIdentification.h:266
SearchParameters & operator=(const SearchParameters &)=default
Assignment operator.
SearchParameters(const SearchParameters &)=default
Copy constructor.
String charges
The allowed charges for the search.
Definition: ProteinIdentification.h:264
Protease digestion_enzyme
The cleavage site information in details (from ProteaseDB)
Definition: ProteinIdentification.h:273
bool operator==(const SearchParameters &rhs) const
bool mergeable(const ProteinIdentification::SearchParameters &sp, const String &experiment_type) const
double fragment_mass_tolerance
Mass tolerance of fragment ions (Dalton or ppm)
Definition: ProteinIdentification.h:269
bool precursor_mass_tolerance_ppm
Mass tolerance unit of precursor ions (true: ppm, false: Dalton)
Definition: ProteinIdentification.h:272
SearchParameters(SearchParameters &&)=default
Move constructor.
double precursor_mass_tolerance
Mass tolerance of precursor ions (Dalton or ppm)
Definition: ProteinIdentification.h:271
SearchParameters & operator=(SearchParameters &&) &=default
Move assignment operator.
std::vector< String > variable_modifications
Allowed variable modifications.
Definition: ProteinIdentification.h:267
PeakMassType mass_type
Mass type of the peaks.
Definition: ProteinIdentification.h:265
String db
The used database.
Definition: ProteinIdentification.h:261
int getChargeValue_(String &charge_str) const
UInt missed_cleavages
The number of allowed missed cleavages.
Definition: ProteinIdentification.h:268