OpenMS
IdentificationDataConverter.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2022.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Hendrik Weisser $
32 // $Authors: Hendrik Weisser $
33 // --------------------------------------------------------------------------
34 
35 #pragma once
36 
39 #include <OpenMS/FORMAT/MzTab.h>
42 
43 namespace OpenMS
44 {
45  class FeatureMap;
46 
47  class OPENMS_DLLAPI IdentificationDataConverter
48  {
49  public:
50 
// Public conversion entry points of IdentificationDataConverter. Only the
// declarations appear in this listing, so the notes below are restricted to
// what the signatures themselves show.
//
// importIDs: takes id_data by non-const reference and the protein/peptide
// identification vectors by const reference.
// NOTE(review): presumably fills id_data from the two vectors - confirm
// against the implementation.
52  static void importIDs(IdentificationData& id_data,
53  const std::vector<ProteinIdentification>& proteins,
54  const std::vector<PeptideIdentification>& peptides);
55 
// exportIDs: id_data is read-only (const reference); proteins and peptides
// are non-const references, i.e. output parameters. export_ids_wo_scores
// defaults to false.
// NOTE(review): presumably the flag controls whether IDs without scores are
// exported - confirm against the implementation.
61  static void exportIDs(const IdentificationData& id_data,
62  std::vector<ProteinIdentification>& proteins,
63  std::vector<PeptideIdentification>& peptides,
64  bool export_ids_wo_scores = false);
65 
// exportMzTab: returns an MzTab object by value; id_data is read-only.
67  static MzTab exportMzTab(const IdentificationData& id_data);
68 
// importSequences: id_data is modifiable, the FASTA entries are read-only and
// decoy_pattern defaults to the empty string.
// NOTE(review): this listing jumps from line 71 to line 74, so whatever is
// declared between `fasta` and `decoy_pattern` is not shown here - check the
// real header before relying on this signature.
70  static void importSequences(IdentificationData& id_data,
71  const std::vector<FASTAFile::FASTAEntry>& fasta,
74  const String& decoy_pattern = "");
75 
// exportParentMatches: parent_matches is read-only, hit is a non-const
// reference (output parameter).
77  static void exportParentMatches(
78  const IdentificationData::ParentMatches& parent_matches, PeptideHit& hit);
79 
// The four functions below all take the map by non-const reference and a
// clear_original flag that defaults to true.
// NOTE(review): presumably clear_original decides whether the source
// representation is removed after conversion - confirm against the
// implementation.
86  static void importFeatureIDs(FeatureMap& features, bool clear_original = true);
87 
94  static void exportFeatureIDs(FeatureMap& features, bool clear_original = true);
95 
102  static void importConsensusIDs(ConsensusMap& consensus, bool clear_original = true);
103 
110  static void exportConsensusIDs(ConsensusMap& consensus, bool clear_original = true);
111 
112  protected:
113 
// Optional reference to a processing step; an empty optional means "no step".
114  using StepOpt = std::optional<IdentificationData::ProcessingStepRef>;
115 
// Functor for ordering StepOpt values (by date of the steps, if available).
// NOTE(review): the struct header (original line 117) is missing from this
// listing; only the body is shown.
118  {
// Returns true if `left` sorts before `right`:
//  - an empty optional sorts before any non-empty one,
//  - two empty optionals are equivalent (false is returned),
//  - otherwise the referenced steps are compared with operator<.
119  bool operator()(const StepOpt& left, const StepOpt& right) const
120  {
121  // @TODO: should runs without associated step go first or last?
// `left` is empty: it is "less" only if `right` holds a value
122  if (!left) return bool(right);
// `left` holds a value but `right` is empty: `left` is not "less"
123  if (!right) return false;
// first '*' unwraps the optional, second '*' dereferences the step reference
124  return **left < **right;
125  }
126  };
127 
// Functor for ordering peptide IDs by RT and m/z (if available).
// NOTE(review): the struct header (original line 129) and the first line of
// the call operator's signature (original line 131, declaring the parameter
// `left` of type const PeptideIdentification&) are missing from this listing.
//
// Resulting order, as implemented below:
//  1. IDs without RT sort before IDs with RT;
//  2. IDs with different RTs sort by ascending RT;
//  3. on equal or missing RTs, IDs without m/z sort before IDs with m/z;
//  4. IDs that both have m/z sort by ascending m/z.
130  {
132  const PeptideIdentification& right) const
133  {
134  // @TODO: should IDs without RT go first or last?
135  if (left.hasRT())
136  {
137  if (right.hasRT())
138  {
// only decide on RT if the two values actually differ
139  if (right.getRT() != left.getRT())
140  {
141  return left.getRT() < right.getRT();
142  } // else: compare by m/z (below)
143  }
144  else
145  {
// `left` has an RT, `right` does not: `right` goes first
146  return false;
147  }
148  }
149  else if (right.hasRT())
150  {
// `left` has no RT, `right` does: `left` goes first
151  return true;
152  }
153  // no RTs or same RTs -> try to compare by m/z:
154  if (left.hasMZ())
155  {
156  if (right.hasMZ())
157  {
158  return left.getMZ() < right.getMZ();
159  }
160  else
161  {
// `left` has an m/z, `right` does not: `right` goes first
162  return false;
163  }
164  }
165  // if both PI's have nothing, return false (to ensure 'x < x' is false for strict weak ordering)
// `left` has no m/z here, so it is "less" exactly when `right` has one
166  return right.hasMZ();
167  }
168  };
169 
// Export a parent sequence (protein or nucleic acid) to mzTab.
// NOTE(review): original lines 172-173 are missing from this listing. Per the
// generated documentation the full signature is
//   static void exportParentSequenceToMzTab_(
//     const IdentificationData::ParentSequence& parent,
//     std::vector<MzTabSectionRow>& output,
//     std::map<IdentificationData::ScoreTypeRef, Size>& score_map)
//
// Builds one row of type MzTabSectionRow from `parent` and appends it to
// `output`; `score_map` is passed on to exportStepsAndScoresToMzTab_.
171  template <typename MzTabSectionRow>
174  std::vector<MzTabSectionRow>& output,
175  std::map<IdentificationData::ScoreTypeRef, Size>& score_map)
176  {
177  MzTabSectionRow row;
178  row.accession.set(parent.accession);
// fills the row's search engine list and best search engine scores
179  exportStepsAndScoresToMzTab_(parent.steps_and_scores, row.search_engine,
180  row.best_search_engine_score, score_map);
181  row.description.set(parent.description);
182  row.coverage.set(parent.coverage);
// the sequence is only written (as optional column "opt_sequence") if present
183  if (!parent.sequence.empty())
184  {
185  MzTabOptionalColumnEntry opt_seq;
186  opt_seq.first = "opt_sequence";
187  opt_seq.second.set(parent.sequence);
188  row.opt_.push_back(opt_seq);
189  }
190  output.push_back(row);
191  }
192 
// Export an identified sequence (peptide or oligonucleotide, but not small
// molecule/compound) to mzTab.
// NOTE(review): original line 195 (the line carrying the function name
// `static void exportPeptideOrOligoToMzTab_(`) is missing from this listing.
//
// Appends to `output` either a single row (no parent information) or one row
// per parent match of `identified`; `score_map` is passed on to
// exportStepsAndScoresToMzTab_.
194  template <typename MzTabSectionRow, typename IdentSeq>
196  const IdentSeq& identified, std::vector<MzTabSectionRow>& output,
197  std::map<IdentificationData::ScoreTypeRef, Size>& score_map)
198  {
199  MzTabSectionRow row;
200  // @TODO: handle modifications properly
201  row.sequence.set(identified.sequence.toString());
202  exportStepsAndScoresToMzTab_(identified.steps_and_scores,
203  row.search_engine,
204  row.best_search_engine_score, score_map);
205  if (identified.parent_matches.empty()) // no parent information given
206  {
207  // row.unique.set(false); // leave this unset?
208  output.push_back(row);
209  }
210  else // generate entries (with duplicated data) for every accession
211  {
212  // in mzTab, "unique" means "peptide is unique for this protein"
// true only if parent_matches holds exactly one entry
213  row.unique.set(identified.parent_matches.size() == 1);
214  for (const auto& match_pair : identified.parent_matches)
215  {
// the accession is shared by all rows generated for this parent
216  row.accession.set(match_pair.first->accession);
217  for (const IdentificationData::ParentMatch& match :
218  match_pair.second)
219  {
// each match gets its own copy of the row, which is handed to
// addMzTabMoleculeParentContext_ before being appended
220  MzTabSectionRow copy = row;
221  addMzTabMoleculeParentContext_(match, copy);
222  output.push_back(copy);
223  }
224  }
225  }
226  }
227 
// Export an input match (peptide- or oligonucleotide-spectrum match) to mzTab.
// NOTE(review): original line 230 (the line carrying the function name
// `static void exportObservationMatchToMzTab_(`) is missing from this listing.
//
// Builds one row of type MzTabSectionRow from `sequence`, `match` and
// `calc_mass` and appends it to `output`. `score_map` is passed on to
// exportStepsAndScoresToMzTab_; `file_map` is indexed by the observation's
// input file to obtain the value given to setMSFile().
229  template <typename MzTabSectionRow>
231  const String& sequence,
232  const IdentificationData::ObservationMatch& match, double calc_mass,
233  std::vector<MzTabSectionRow>& output,
234  std::map<IdentificationData::ScoreTypeRef, Size>& score_map,
235  std::map<IdentificationData::InputFileRef, Size>& file_map)
236  {
237  MzTabSectionRow xsm; // PSM or OSM
238  // @TODO: handle modifications properly
239  xsm.sequence.set(sequence);
240  exportStepsAndScoresToMzTab_(match.steps_and_scores, xsm.search_engine,
241  xsm.search_engine_score, score_map);
242  const IdentificationData::Observation& query = *match.observation_ref;
// retention time is written as a one-element list
243  std::vector<MzTabDouble> rts(1);
244  rts[0].set(query.rt);
245  xsm.retention_time.set(rts);
246  xsm.charge.set(match.charge);
247  xsm.exp_mass_to_charge.set(query.mz);
// NOTE(review): if match.charge is 0 this divides by zero - confirm that
// matches reaching this point always carry a non-zero charge
248  xsm.calc_mass_to_charge.set(calc_mass / abs(match.charge));
// NOTE(review): std::map::operator[] default-inserts an entry (value 0) if
// query.input_file is not yet a key of file_map - confirm that all input
// files are registered before this is called
249  xsm.spectra_ref.setMSFile(file_map[query.input_file]);
250  xsm.spectra_ref.setSpecRef(query.data_id);
251  // optional column for adduct:
252  if (match.adduct_opt)
253  {
254  MzTabOptionalColumnEntry opt_adduct;
255  opt_adduct.first = "opt_adduct";
256  opt_adduct.second.set((*match.adduct_opt)->getName());
257  xsm.opt_.push_back(opt_adduct);
258  }
259  // optional columns for isotope offset:
260  // @TODO: find a way of passing in the names of relevant meta values
261  // (e.g. from NucleicAcidSearchEngine), instead of hard-coding them here
262  if (match.metaValueExists("isotope_offset"))
263  {
264  MzTabOptionalColumnEntry opt_meta;
265  opt_meta.first = "opt_isotope_offset";
266  opt_meta.second.set(match.getMetaValue("isotope_offset"));
267  xsm.opt_.push_back(opt_meta);
268  }
269  // don't repeat data from the peptide section (e.g. accessions)
270  // why are "pre"/"post"/"start"/"end" not in the peptide section?!
271  output.push_back(xsm);
272  }
273 
// Helper declarations (definitions are not part of this listing). Notes are
// limited to what the signatures show.
//
// exportStepsAndScoresToMzTab_: steps_and_scores is read-only; steps_out,
// scores_out and score_map are non-const references. The template functions
// in this header call it to fill a row's search engine and score columns.
275  static void exportStepsAndScoresToMzTab_(
276  const IdentificationData::AppliedProcessingSteps& steps_and_scores,
277  MzTabParameterList& steps_out, std::map<Size, MzTabDouble>& scores_out,
278  std::map<IdentificationData::ScoreTypeRef, Size>& score_map);
279 
// addMzTabSEScores_: `scores` is read-only, `output` is a non-const reference.
281  static void addMzTabSEScores_(
282  const std::map<IdentificationData::ScoreTypeRef, Size>& scores,
283  std::map<Size, MzTabParameter>& output);
284 
// addMzTabMoleculeParentContext_ (two overloads): both take a ParentMatch by
// const reference.
// NOTE(review): the last parameter line of each overload (original lines 288
// and 293) is missing from this listing, so the row types they accept are not
// visible here.
286  static void addMzTabMoleculeParentContext_(
287  const IdentificationData::ParentMatch& match,
289 
291  static void addMzTabMoleculeParentContext_(
292  const IdentificationData::ParentMatch& match,
294 
// importDBSearchParameters_: returns an IdentificationData::SearchParamRef.
// NOTE(review): original line 297 (first parameter) is missing from this
// listing.
296  static IdentificationData::SearchParamRef importDBSearchParameters_(
298  IdentificationData& id_data);
299 
// exportDBSearchParameters_: returns ProteinIdentification::SearchParameters
// by value.
// NOTE(review): original line 302 (parameter list) is missing from this
// listing.
301  static ProteinIdentification::SearchParameters exportDBSearchParameters_(
303 
// exportMSRunInformation_: `protein` is a non-const reference.
// NOTE(review): original line 306 (first parameter) is missing from this
// listing.
305  static void exportMSRunInformation_(
307  ProteinIdentification& protein);
308 
// handleFeatureImport_: feature, peptides and id_counter are non-const
// references; indexes is read-only.
309  static void handleFeatureImport_(Feature& feature, const IntList& indexes,
310  std::vector<PeptideIdentification>& peptides,
311  Size& id_counter, bool clear_original);
312 
// handleFeatureExport_: feature, id_data and id_counter are non-const
// references; indexes is read-only.
313  static void handleFeatureExport_(Feature& feature, const IntList& indexes,
314  IdentificationData& id_data, Size& id_counter);
315  };
316 }
Representation of a protein identification run.
Definition: ProteinIdentification.h:74
double coverage
sequence coverage as a fraction between 0 and 1
Definition: ParentSequence.h:61
bool operator()(const StepOpt &left, const StepOpt &right) const
Definition: IdentificationDataConverter.h:119
A more convenient string class.
Definition: String.h:58
OLI - Oligonucleotide section (table-based)
Definition: MzTab.h:396
MoleculeType
Definition: MetaData.h:65
bool hasMZ() const
shortcut for !isnan(getMZ())
AppliedProcessingSteps steps_and_scores
Definition: ScoredProcessingResult.h:46
A container for features.
Definition: FeatureMap.h:98
Representation of a parent sequence that is identified only indirectly (e.g. a protein).
Definition: ParentSequence.h:49
Meta data for the association between an identified molecule (e.g. peptide) and a parent sequence (e...
Definition: ParentMatch.h:45
String data_id
Spectrum or feature ID (from the file referenced by input_file)
Definition: Observation.h:56
IdentificationDataInternal::ParentMatches ParentMatches
Definition: IdentificationData.h:164
Definition: IdentificationData.h:112
A container for consensus elements.
Definition: ConsensusMap.h:84
std::vector< Int > IntList
Vector of signed integers.
Definition: ListUtils.h:55
String sequence
Definition: ParentSequence.h:57
Definition: MzTabBase.h:268
Representation of an observation, e.g. a spectrum or feature, in an input data file.
Definition: Observation.h:53
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:47
Definition: IdentificationDataConverter.h:47
Search parameters of the DB search.
Definition: ProteinIdentification.h:271
const DataValue & getMetaValue(const String &name) const
Returns the value corresponding to a string, or DataValue::EMPTY if not found.
double getMZ() const
returns the MZ of the MS2 spectrum
static void exportObservationMatchToMzTab_(const String &sequence, const IdentificationData::ObservationMatch &match, double calc_mass, std::vector< MzTabSectionRow > &output, std::map< IdentificationData::ScoreTypeRef, Size > &score_map, std::map< IdentificationData::InputFileRef, Size > &file_map)
Export an input match (peptide- or oligonucleotide-spectrum match) to mzTab.
Definition: IdentificationDataConverter.h:230
InputFileRef input_file
Reference to the input file.
Definition: Observation.h:59
IdentificationDataInternal::AppliedProcessingSteps AppliedProcessingSteps
Definition: IdentificationData.h:153
double getRT() const
returns the RT of the MS2 spectrum where the identification occurred
bool metaValueExists(const String &name) const
Returns whether an entry with the given name exists.
Representation of a peptide hit.
Definition: PeptideHit.h:55
bool hasRT() const
shortcut for !isnan(getRT())
An LC-MS feature.
Definition: Feature.h:70
String accession
Definition: ParentSequence.h:51
static void exportPeptideOrOligoToMzTab_(const IdentSeq &identified, std::vector< MzTabSectionRow > &output, std::map< IdentificationData::ScoreTypeRef, Size > &score_map)
Export an identified sequence (peptide or oligonucleotide, but not small molecule/compound) to mzTab...
Definition: IdentificationDataConverter.h:195
double mz
Definition: Observation.h:61
AdductOpt adduct_opt
optional reference to adduct
Definition: ObservationMatch.h:81
Representation of a search hit (e.g. peptide-spectrum match).
Definition: ObservationMatch.h:73
Functor for ordering peptide IDs by RT and m/z (if available)
Definition: IdentificationDataConverter.h:129
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:127
bool operator()(const PeptideIdentification &left, const PeptideIdentification &right) const
Definition: IdentificationDataConverter.h:131
Int charge
Definition: ObservationMatch.h:79
static void exportParentSequenceToMzTab_(const IdentificationData::ParentSequence &parent, std::vector< MzTabSectionRow > &output, std::map< IdentificationData::ScoreTypeRef, Size > &score_map)
Export a parent sequence (protein or nucleic acid) to mzTab.
Definition: IdentificationDataConverter.h:172
Functor for ordering StepOpt (by date of the steps, if available):
Definition: IdentificationDataConverter.h:117
String description
Definition: ParentSequence.h:59
double rt
Definition: Observation.h:61
PEP - Peptide section (Table based)
Definition: MzTab.h:242
ObservationRef observation_ref
Definition: ObservationMatch.h:77
std::optional< IdentificationData::ProcessingStepRef > StepOpt
Definition: IdentificationDataConverter.h:114
Wrapper that adds operator< to iterators, so they can be used as (part of) keys in maps/sets or multi...
Definition: MetaData.h:45
Represents the peptide hits for a spectrum.
Definition: PeptideIdentification.h:63
std::pair< String, MzTabString > MzTabOptionalColumnEntry
Definition: MzTabBase.h:229
Data model of MzTab files. Please see the official MzTab specification at https://code.google.com/p/mztab/.
Definition: MzTab.h:477