OpenMS  3.0.0
ProteinIdentification.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2022.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Chris Bielow $
32 // $Authors: Nico Pfeifer, Chris Bielow $
33 // --------------------------------------------------------------------------
34 
35 #pragma once
36 
45 
46 #include <set>
47 
48 namespace OpenMS
49 {
// Forward declarations of OpenMS types that the declarations of ProteinIdentification below refer to.
50  class MSExperiment;
51  class PeptideIdentification;
52  class PeptideEvidence;
53  class ConsensusMap;
54 
74  class OPENMS_DLLAPI ProteinIdentification :
75  public MetaInfoInterface
76  {
77 public:
80 
/// Two-way mapping between ms-run-path and protID|pepID-identifier.
82  struct Mapping
83  {
    /// Run identifier -> list of primary MS run paths of that run.
84  std::map<String, StringList> identifier_to_msrunpath;
    /// Reverse lookup: list of primary MS run paths -> run identifier.
85  std::map<StringList, String> runpath_to_identifier;
86 
    /// Creates a mapping with both lookup tables empty.
87  Mapping() = default;
88 
    /// Builds both lookup tables from @p prot_ids by delegating to create().
89  explicit Mapping(const std::vector<ProteinIdentification>& prot_ids)
90  {
91  create(prot_ids);
92  }
93 
    /// @brief Rebuilds both lookup tables from @p prot_ids; previous content is discarded.
    /// @param prot_ids Identification runs to index.
    /// @throws Exception::MissingInformation if the path list is empty after querying a run.
    /// @throws Exception::InvalidValue if the path list of a run is already registered for an earlier run.
94  void create(const std::vector<ProteinIdentification>& prot_ids)
95  {
96  identifier_to_msrunpath.clear();
97  runpath_to_identifier.clear();
     // NOTE(review): this list is declared outside the loop and never cleared between runs.
     // Whether getPrimaryMSRunPath() overwrites it or can leave it untouched is not visible here;
     // if it can leave it untouched, a run without annotation would inherit the previous run's
     // paths and pass the empty() check below -- confirm.
98  StringList filenames;
99  for (const ProteinIdentification& prot_id : prot_ids)
100  {
101  prot_id.getPrimaryMSRunPath(filenames);
102  if (filenames.empty())
103  {
104  throw Exception::MissingInformation(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "No MS run path annotated in ProteinIdentification.");
105  }
     // Stored before the duplicate check: if the check below throws, this entry is already in the table.
106  identifier_to_msrunpath[prot_id.getIdentifier()] = filenames;
     // The same list of paths must not be registered for two different runs.
107  const auto& it = runpath_to_identifier.find(filenames);
108  if (it != runpath_to_identifier.end())
109  {
110  throw Exception::InvalidValue(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION,
111  "Multiple protein identifications with the same ms-run-path in Consensus/FeatureXML. Check input!\n",
112  ListUtils::concatenate(filenames, ","));
113  }
114  runpath_to_identifier[filenames] = prot_id.getIdentifier();
115  }
116  }
117 
     // The declaration of this function (listing line 118) is absent from this listing; the index at the
     // end of the page gives it as: String getPrimaryMSRunPath(const PeptideIdentification& pepid) const
     // It returns the MS run path for @p pepid, or an empty string if the merge index is out of range.
     // std::map::at() throws std::out_of_range if the identifier of @p pepid was never registered.
119  {
120  // if a merge index n is annotated, we use the filename annotated at index n in the protein identification, otherwise the one at index 0
121  size_t merge_index = pepid.getMetaValue(Constants::UserParam::ID_MERGE_INDEX, 0);
122  const auto& filenames = identifier_to_msrunpath.at(pepid.getIdentifier());
123  return (merge_index < filenames.size()) ? filenames[merge_index] : ""; // return filename or empty string if missing
124  }
125  };
126 
/// Bundles multiple (e.g. indistinguishable) proteins in a group.
130  class OPENMS_DLLAPI ProteinGroup
131  {
132  public:
     /// Collection of float data arrays.
135  typedef std::vector<FloatDataArray> FloatDataArrays;
     /// Collection of string data arrays.
138  typedef std::vector<StringDataArray> StringDataArrays;
     /// Collection of integer data arrays.
141  typedef std::vector<IntegerDataArray> IntegerDataArrays;
142 
     /// Probability of this group.
144  double probability;
145 
     /// Accessions of (indistinguishable) proteins that belong to the same group.
147  std::vector<String> accessions;
148 
     /// Default constructor (declared here; the definition is not part of this listing).
149  ProteinGroup();
150 
     /// Equality operator (declared here; which members are compared is not visible in this listing).
152  bool operator==(const ProteinGroup& rhs) const;
153 
154  /**
155  @brief Comparison operator (for sorting)
156 
157  This operator is intended for sorting protein groups in a "best first"
158  manner. That means higher probabilities are "less" than lower
159  probabilities (!); smaller groups are "less" than larger groups;
160  everything else being equal, accessions are compared lexicographically.
161  */
162  bool operator<(const ProteinGroup& rhs) const;
163 
165 
     /// Returns a const reference to the float meta data arrays.
176  const FloatDataArrays& getFloatDataArrays() const;
178 
     // Declaration absent from this listing (listing line 180); per the page index:
     // FloatDataArrays& getFloatDataArrays() -- returns a mutable reference to the float meta data arrays.
181  {
182  return float_data_arrays_;
183  }
184 
     /// Setter for the float meta data arrays.
186  void setFloatDataArrays(const FloatDataArrays& fda);
187 
     /// Returns a const reference to the string meta data arrays.
189  const StringDataArrays& getStringDataArrays() const;
190 
     /// Returns a mutable reference to the string meta data arrays.
192  StringDataArrays& getStringDataArrays();
193 
     /// Setter for the string meta data arrays.
195  void setStringDataArrays(const StringDataArrays& sda);
196 
     /// Returns a const reference to the integer meta data arrays.
198  const IntegerDataArrays& getIntegerDataArrays() const;
199 
     /// Returns a mutable reference to the integer meta data arrays.
201  IntegerDataArrays& getIntegerDataArrays();
202 
     /// Setter for the integer meta data arrays.
204  void setIntegerDataArrays(const IntegerDataArrays& ida);
205 
     // The declarations of the six by-name accessors below are absent from this listing; the signatures
     // quoted here come from the page index. All six share one caveat: std::find_if returns end() when no
     // array carries the requested name, and that iterator is dereferenced unconditionally, which is
     // undefined behavior. Callers must only ask for names that exist.

     // Listing line 207: IntegerDataArray& getIntegerDataArrayByName(String name)
     // Returns a mutable reference to the first integer meta data array with the given name.
208  {
209  return *std::find_if(integer_data_arrays_.begin(), integer_data_arrays_.end(),
210  [&name](const IntegerDataArray& da) { return da.getName() == name; } );
211  }
212 
     // Listing line 214: StringDataArray& getStringDataArrayByName(String name)
     // Returns a mutable reference to the first string meta data array with the given name.
215  {
216  return *std::find_if(string_data_arrays_.begin(), string_data_arrays_.end(),
217  [&name](const StringDataArray& da) { return da.getName() == name; } );
218  }
219 
     // Listing line 221: FloatDataArray& getFloatDataArrayByName(String name)
     // Returns a mutable reference to the first float meta data array with the given name.
222  {
223  return *std::find_if(float_data_arrays_.begin(), float_data_arrays_.end(),
224  [&name](const FloatDataArray& da) { return da.getName() == name; } );
225  }
226 
     // Listing line 228: const IntegerDataArray& getIntegerDataArrayByName(String name) const
     // Returns a const reference to the first integer meta data array with the given name.
229  {
230  return *std::find_if(integer_data_arrays_.begin(), integer_data_arrays_.end(),
231  [&name](const IntegerDataArray& da) { return da.getName() == name; } );
232  }
233 
     // Listing line 235: const StringDataArray& getStringDataArrayByName(String name) const
     // Returns a const reference to the first string meta data array with the given name.
236  {
237  return *std::find_if(string_data_arrays_.begin(), string_data_arrays_.end(),
238  [&name](const StringDataArray& da) { return da.getName() == name; } );
239  }
240 
     // Listing line 242: const FloatDataArray& getFloatDataArrayByName(String name) const
     // Returns a const reference to the first float meta data array with the given name.
243  {
244  return *std::find_if(float_data_arrays_.begin(), float_data_arrays_.end(),
245  [&name](const FloatDataArray& da) { return da.getName() == name; } );
246  }
247 
248  private:
     // The data members are absent from this listing; per the page index they are
     // float_data_arrays_ (listing line 250), string_data_arrays_ (253) and integer_data_arrays_ (256).
251 
254 
257  };
258 
// Peak mass type. The `PeakMassType` declaration line (listing line 260) and the enumerators on
// listing lines 262-263 are absent from this listing; only the final enumerator is visible.
261  {
     // Last visible enumerator; used below as the length of NamesOfPeakMassType.
264  SIZE_OF_PEAKMASSTYPE
265  };
266 
     // Array with SIZE_OF_PEAKMASSTYPE string entries.
     // NOTE(review): presumably one display name per PeakMassType value, indexed by the enumerator -- confirm in the implementation file.
268  static const std::string NamesOfPeakMassType[SIZE_OF_PEAKMASSTYPE];
269 
/// Search parameters of the DB search.
// Only two data members are visible in this listing. Per the page index the struct also declares, on
// listing lines 274-278 and 281-287: db, db_version, taxonomy, charges, mass_type, missed_cleavages,
// fragment_mass_tolerance, fragment_mass_tolerance_ppm, precursor_mass_tolerance,
// precursor_mass_tolerance_ppm, digestion_enzyme and enzyme_term_specificity.
271  struct OPENMS_DLLAPI SearchParameters :
272  public MetaInfoInterface
273  {
     /// Used fixed modifications.
279  std::vector<String> fixed_modifications;
     /// Allowed variable modifications.
280  std::vector<String> variable_modifications;
288 
     /// Copy constructor (compiler-generated).
291  SearchParameters(const SearchParameters&) = default;
     /// Move constructor (compiler-generated).
293  SearchParameters(SearchParameters&&) = default;
     /// Destructor (compiler-generated).
295  ~SearchParameters() = default;
296 
     /// Copy assignment (compiler-generated).
298  SearchParameters& operator=(const SearchParameters&) = default;
     /// Move assignment (compiler-generated). The trailing `&` ref-qualifier restricts it to lvalue
     /// targets; the copy assignment above carries no such qualifier.
300  SearchParameters& operator=(SearchParameters&&)& = default;
301 
     /// Equality operator (declared here; definition not part of this listing).
302  bool operator==(const SearchParameters& rhs) const;
303 
     /// Inequality operator (declared here; definition not part of this listing).
304  bool operator!=(const SearchParameters& rhs) const;
305 
     /// Returns a pair of two ints describing the charge range.
     /// NOTE(review): presumably (min, max) parsed from the `charges` member -- confirm in the implementation.
307  std::pair<int,int> getChargeRange() const;
308 
     /// Tells whether these parameters can be merged with @p sp for the given @p experiment_type.
     /// NOTE(review): the exact criteria are defined in the implementation and are not visible here.
313  bool mergeable(const ProteinIdentification::SearchParameters& sp, const String& experiment_type) const;
314 
315  private:
     // Private helper; takes the charge string by non-const reference and yields an int.
316  int getChargeValue_(String& charge_str) const;
317  };
318 
// Destructor; declared virtual and defined outside this listing.
328  virtual ~ProteinIdentification();
329 
     /// Copy assignment (compiler-generated).
331  ProteinIdentification& operator=(const ProteinIdentification&) = default;
     /// Move assignment (compiler-generated).
333  ProteinIdentification& operator=(ProteinIdentification&&) = default;
334 
     /// Equality operator (declared here; definition not part of this listing).
336  bool operator==(const ProteinIdentification& rhs) const;
     /// Inequality operator (declared here; definition not part of this listing).
338  bool operator!=(const ProteinIdentification& rhs) const;
340 
342 
// Protein hit and protein group accessors (declarations only; the definitions are not part of this listing).
     /// Returns the protein hits (read-only).
343  const std::vector<ProteinHit>& getHits() const;
     /// Returns the protein hits (mutable).
346  std::vector<ProteinHit>& getHits();
     /// Inserts a protein hit, taking it by const reference.
348  void insertHit(const ProteinHit& input);
     /// Inserts a protein hit, taking it by rvalue reference.
350  void insertHit(ProteinHit&& input);
351 
     /// Sets the protein hits from @p hits.
357  void setHits(const std::vector<ProteinHit>& hits);
358 
     /// Looks up a hit by @p accession and returns an iterator into the hit vector.
     /// NOTE(review): presumably end() of the hit vector when nothing matches -- confirm in the implementation.
360  std::vector<ProteinHit>::iterator findHit(const String& accession);
361 
     /// Returns the protein groups (read-only).
363  const std::vector<ProteinGroup>& getProteinGroups() const;
     /// Returns the protein groups (mutable).
365  std::vector<ProteinGroup>& getProteinGroups();
     /// Inserts a protein group.
367  void insertProteinGroup(const ProteinGroup& group);
368 
     /// Returns the groups of indistinguishable proteins (read-only).
370  const std::vector<ProteinGroup>& getIndistinguishableProteins() const;
     /// Returns the groups of indistinguishable proteins (mutable).
372  std::vector<ProteinGroup>& getIndistinguishableProteins();
     /// Inserts a group of indistinguishable proteins.
374  void insertIndistinguishableProteins(const ProteinGroup& group);
     /// NOTE(review): judging by the name, adds single-protein groups to the indistinguishable groups; details are not visible here.
376  void fillIndistinguishableGroupsWithSingletons();
377 
// Score handling, sorting, and coverage/modification computation (declarations only).
     /// Returns the protein significance threshold.
379  double getSignificanceThreshold() const;
     /// Sets the protein significance threshold.
381  void setSignificanceThreshold(double value);
     /// Returns the protein score type.
383  const String& getScoreType() const;
     /// Sets the protein score type.
385  void setScoreType(const String& type);
     /// Returns whether a higher score counts as better.
387  bool isHigherScoreBetter() const;
     /// Sets whether a higher score counts as better.
389  void setHigherScoreBetter(bool higher_is_better);
     /// Sorts in place. NOTE(review): presumably the protein hits by score, honoring the score orientation -- confirm.
391  void sort();
     /// Assigns ranks. NOTE(review): presumably to the protein hits, based on their order -- confirm.
393  void assignRanks();
     /// Computes coverage using the given peptide identifications.
401  void computeCoverage(const std::vector<PeptideIdentification>& pep_ids);
     /// Computes coverage using the identifications of a consensus map.
     /// NOTE(review): @p use_unassigned_ids presumably includes IDs not assigned to a consensus feature -- confirm.
402  void computeCoverage(const ConsensusMap& cmap, bool use_unassigned_ids);
404 
     /// Computes modifications from the given peptide identifications; @p skip_modifications lists modifications to leave out.
411  void computeModifications(
412  const std::vector<PeptideIdentification>& pep_ids,
413  const StringList& skip_modifications);
     /// Same computation using the identifications of a consensus map.
414  void computeModifications(
415  const ConsensusMap& cmap,
416  const StringList& skip_modifications,
417  bool use_unassigned_ids);
418 
419 
421 
// Meta data of the identification run (declarations only; the definitions are not part of this listing).
     /// Returns the date of the run.
422  const DateTime& getDateTime() const;
     /// Sets the date of the run.
425  void setDateTime(const DateTime& date);
     /// Sets the search engine name.
427  void setSearchEngine(const String& search_engine);
     /// Returns the search engine name.
429  const String& getSearchEngine() const;
     /// Returns the original search engine name; returned by value, not by reference.
431  const String getOriginalSearchEngineName() const;
     /// Sets the search engine version.
433  void setSearchEngineVersion(const String& search_engine_version);
     /// Returns the search engine version.
435  const String& getSearchEngineVersion() const;
     /// Sets the inference engine name.
437  void setInferenceEngine(const String& search_engine);
     /// Returns the inference engine name; returned by value, not by reference.
439  const String getInferenceEngine() const;
     /// Sets the inference engine version.
441  void setInferenceEngineVersion(const String& inference_engine_version);
     /// Returns the inference engine version; returned by value, not by reference.
443  const String getInferenceEngineVersion() const;
     /// Sets the search parameters from a const reference.
445  void setSearchParameters(const SearchParameters& search_parameters);
     /// Sets the search parameters from an rvalue reference.
447  void setSearchParameters(SearchParameters&& search_parameters);
     /// Returns the search parameters (read-only).
449  const SearchParameters& getSearchParameters() const;
     /// Returns the search parameters (mutable).
451  SearchParameters& getSearchParameters();
     /// Returns the identifier of this run.
453  const String& getIdentifier() const;
     /// Sets the identifier of this run.
455  void setIdentifier(const String& id);
     /// Sets the primary MS run paths.
     /// NOTE(review): @p raw presumably selects the raw-file variant of the annotation -- confirm in the implementation.
461  void setPrimaryMSRunPath(const StringList& s, bool raw = false);
462 
     /// Sets the primary MS run paths; additionally receives a mutable MSExperiment.
     /// NOTE(review): what is read from or written to @p e is not visible here.
464  void setPrimaryMSRunPath(const StringList& s, MSExperiment& e);
     /// Adds a single primary MS run path.
465  void addPrimaryMSRunPath(const String& s, bool raw = false);
     /// Adds several primary MS run paths.
466  void addPrimaryMSRunPath(const StringList& s, bool raw = false);
467 
     /// Writes the primary MS run paths to the out-parameter @p output.
     /// NOTE(review): not visible here whether @p output is overwritten, appended to, or left untouched when nothing is annotated.
473  void getPrimaryMSRunPath(StringList& output, bool raw = false) const;
474 
     /// Returns the number of primary MS run paths.
476  Size nrPrimaryMSRunPaths(bool raw = false) const;
477 
     /// Tells whether this run carries inference data (criteria are defined in the implementation).
480  bool hasInferenceData() const;
481 
     /// Tells whether the inference engine is recorded as the search engine (criteria are defined in the implementation).
483  bool hasInferenceEngineAsSearchEngine() const;
484 
     /// Tells whether peptide IDs of this run and @p id_run can be merged for the given @p experiment_type.
488  bool peptideIDsMergeable(const ProteinIdentification& id_run, const String& experiment_type) const;
489 
     /// Returns search engine settings as (String, String) pairs; @p se defaults to an empty string.
     /// NOTE(review): presumably an empty @p se means "use this run's search engine" -- confirm.
492  std::vector<std::pair<String,String>> getSearchEngineSettingsAsPairs(const String& se = "") const;
493 
495 
     /// Copies meta data from the given run. NOTE(review): judging by the name, hits and groups are not copied -- confirm.
497  void copyMetaDataOnly(const ProteinIdentification&);
498 protected:
500 
507 
509 
// Protected data members. Only three are visible in this listing; per the page index the class also
// declares id_, search_engine_, search_engine_version_, search_parameters_, date_ (listing lines 501-505),
// protein_score_type_ (510), higher_score_better_ (511) and protein_significance_threshold_ (516).
     /// The protein hits of this run.
512  std::vector<ProteinHit> protein_hits_;
     /// The protein groups of this run.
513  std::vector<ProteinGroup> protein_groups_;
     /// Indistinguishable proteins: accessions[0] is "group leader", probability is meaningless.
515  std::vector<ProteinGroup> indistinguishable_proteins_;
518 
519  private:
     // Private helper: computes coverage from a map of String keys to sets of PeptideEvidence.
     // NOTE(review): the keys are presumably protein accessions (cf. map_acc_2_evidence below) -- confirm.
520  void computeCoverageFromEvidenceMapping_(const std::unordered_map<String, std::set<PeptideEvidence>>& map);
     // Private helper: fills the out-parameter map_acc_2_evidence from the given peptide identifications.
521  void fillEvidenceMapping_(std::unordered_map<String, std::set<PeptideEvidence> >& map_acc_2_evidence,
522  const std::vector<PeptideIdentification>& pep_ids) const;
523 
     // Private helper: fills the out-parameter prot2mod with sets of (Size, ResidueModification) pairs per String key.
     // NOTE(review): the Size is presumably the position of the modification within the protein -- confirm.
524  void fillModMapping_(const std::vector<PeptideIdentification>& pep_ids, const StringList& skip_modifications,
525  std::unordered_map<String, std::set<std::pair<Size, ResidueModification>>>& prot2mod) const;
526  };
527 
528 
529 } //namespace OpenMS
Representation of a protein identification run.
Definition: ProteinIdentification.h:74
std::vector< FloatDataArray > FloatDataArrays
Definition: ProteinIdentification.h:135
String db
The used database.
Definition: ProteinIdentification.h:274
std::vector< StringDataArray > StringDataArrays
Definition: ProteinIdentification.h:138
A more convenient string class.
Definition: String.h:58
IntegerDataArrays integer_data_arrays_
Integer data arrays.
Definition: ProteinIdentification.h:256
StringDataArray & getStringDataArrayByName(String name)
Returns a mutable reference to the first string meta data array with the given name.
Definition: ProteinIdentification.h:214
std::vector< String > fixed_modifications
Used fixed modifications.
Definition: ProteinIdentification.h:279
String db_version
The database version.
Definition: ProteinIdentification.h:275
std::map< String, StringList > identifier_to_msrunpath
Definition: ProteinIdentification.h:84
IntegerDataArray & getIntegerDataArrayByName(String name)
Returns a mutable reference to the first integer meta data array with the given name.
Definition: ProteinIdentification.h:207
void create(const std::vector< ProteinIdentification > &prot_ids)
Definition: ProteinIdentification.h:94
Definition: ProteinIdentification.h:263
double fragment_mass_tolerance
Mass tolerance of fragment ions (Dalton or ppm)
Definition: ProteinIdentification.h:282
const StringDataArray & getStringDataArrayByName(String name) const
Returns a const reference to the first string meta data array with the given name.
Definition: ProteinIdentification.h:235
unsigned int UInt
Unsigned integer type.
Definition: Types.h:94
FloatDataArrays float_data_arrays_
Float data arrays.
Definition: ProteinIdentification.h:250
String search_engine_version_
Definition: ProteinIdentification.h:503
double precursor_mass_tolerance
Mass tolerance of precursor ions (Dalton or ppm)
Definition: ProteinIdentification.h:284
FloatDataArrays & getFloatDataArrays()
Returns a mutable reference to the float meta data arrays.
Definition: ProteinIdentification.h:180
A container for consensus elements.
Definition: ConsensusMap.h:83
std::map< StringList, String > runpath_to_identifier
Definition: ProteinIdentification.h:85
OpenMS::DataArrays::FloatDataArray FloatDataArray
Float data array vector type.
Definition: ProteinIdentification.h:134
bool operator==(const IDBoostGraph::ProteinGroup &lhs, const IDBoostGraph::ProteinGroup &rhs)
ProteinHit HitType
Hit type definition.
Definition: ProteinIdentification.h:79
Mapping(const std::vector< ProteinIdentification > &prot_ids)
Definition: ProteinIdentification.h:89
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:47
Search parameters of the DB search.
Definition: ProteinIdentification.h:271
bool operator<(const MultiplexDeltaMasses &dm1, const MultiplexDeltaMasses &dm2)
const DataValue & getMetaValue(const String &name) const
Returns the value corresponding to a string, or DataValue::EMPTY if not found.
Float data array class.
Definition: DataArrays.h:45
bool precursor_mass_tolerance_ppm
Mass tolerance unit of precursor ions (true: ppm, false: Dalton)
Definition: ProteinIdentification.h:285
String taxonomy
The taxonomy restriction.
Definition: ProteinIdentification.h:276
Integer data array class.
Definition: DataArrays.h:52
String getPrimaryMSRunPath(const PeptideIdentification &pepid) const
Definition: ProteinIdentification.h:118
PeakMassType
Peak mass type.
Definition: ProteinIdentification.h:260
FloatDataArray & getFloatDataArrayByName(String name)
Returns a mutable reference to the first float meta data array with the given name.
Definition: ProteinIdentification.h:221
const std::string ID_MERGE_INDEX
Definition: Constants.h:323
const IntegerDataArray & getIntegerDataArrayByName(String name) const
Returns a const reference to the first integer meta data array with the given name.
Definition: ProteinIdentification.h:228
std::vector< String > accessions
Accessions of (indistinguishable) proteins that belong to the same group.
Definition: ProteinIdentification.h:147
Specificity
when querying for valid digestion products, this determines if the specificity of the two peptide end...
Definition: EnzymaticDigestion.h:67
std::vector< ProteinHit > protein_hits_
Definition: ProteinIdentification.h:512
static String concatenate(const std::vector< T > &container, const String &glue="")
Concatenates all elements of the container and puts the glue string between elements.
Definition: ListUtils.h:209
Protease digestion_enzyme
The cleavage site information in details (from ProteaseDB)
Definition: ProteinIdentification.h:286
EnzymaticDigestion::Specificity enzyme_term_specificity
The number of required cutting-rule matching termini during search (none=0, semi=1, or full=2)
Definition: ProteinIdentification.h:287
String protein_score_type_
Definition: ProteinIdentification.h:510
std::vector< IntegerDataArray > IntegerDataArrays
Definition: ProteinIdentification.h:141
double protein_significance_threshold_
Definition: ProteinIdentification.h:516
Two-way mapping between ms-run-path and protID|pepID-identifier
Definition: ProteinIdentification.h:82
String id_
Definition: ProteinIdentification.h:501
UInt missed_cleavages
The number of allowed missed cleavages.
Definition: ProteinIdentification.h:281
Interface for classes that can store arbitrary meta information (Type-Name-Value tuples).
Definition: MetaInfoInterface.h:60
std::vector< ProteinGroup > protein_groups_
Definition: ProteinIdentification.h:513
String search_engine_
Definition: ProteinIdentification.h:502
std::vector< String > StringList
Vector of String.
Definition: ListUtils.h:70
Representation of a protein hit.
Definition: ProteinHit.h:58
Invalid value exception.
Definition: Exception.h:327
bool higher_score_better_
Definition: ProteinIdentification.h:511
In-Memory representation of a mass spectrometry run.
Definition: MSExperiment.h:70
DateTime date_
Definition: ProteinIdentification.h:505
Definition: ProteinIdentification.h:262
bool operator!=(_Iterator< _Val, _Ref, _Ptr > const &, _Iterator< _Val, _Ref, _Ptr > const &)
Definition: KDTree.h:824
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:127
DateTime Class.
Definition: DateTime.h:58
const FloatDataArray & getFloatDataArrayByName(String name) const
Returns a const reference to the first float meta data array with the given name. ...
Definition: ProteinIdentification.h:242
bool fragment_mass_tolerance_ppm
Mass tolerance unit of fragment ions (true: ppm, false: Dalton)
Definition: ProteinIdentification.h:283
std::vector< ProteinGroup > indistinguishable_proteins_
Indistinguishable proteins: accessions[0] is "group leader", probability is meaningless.
Definition: ProteinIdentification.h:515
String data array class.
Definition: DataArrays.h:59
PeakMassType mass_type
Mass type of the peaks.
Definition: ProteinIdentification.h:278
Bundles multiple (e.g. indistinguishable) proteins in a group.
Definition: ProteinIdentification.h:130
SearchParameters search_parameters_
Definition: ProteinIdentification.h:504
OpenMS::DataArrays::StringDataArray StringDataArray
String data array vector type.
Definition: ProteinIdentification.h:137
OpenMS::DataArrays::IntegerDataArray IntegerDataArray
Integer data array vector type.
Definition: ProteinIdentification.h:140
String charges
The allowed charges for the search.
Definition: ProteinIdentification.h:277
double probability
Probability of this group.
Definition: ProteinIdentification.h:144
StringDataArrays string_data_arrays_
String data arrays.
Definition: ProteinIdentification.h:253
const String & getIdentifier() const
Returns the identifier which links this PI to its corresponding ProteinIdentification.
Not all required information provided.
Definition: Exception.h:186
Representation of a digestion enzyme for proteins (protease)
Definition: DigestionEnzymeProtein.h:48
Represents the peptide hits for a spectrum.
Definition: PeptideIdentification.h:63
std::vector< String > variable_modifications
Allowed variable modifications.
Definition: ProteinIdentification.h:280