OpenMS  3.0.0
FeatureFinderIdentificationAlgorithm.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2022.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Timo Sachsenberg $
32 // $Authors: Hendrik Weisser $
33 // --------------------------------------------------------------------------
34 
35 #ifndef OPENMS_TRANSFORMATIONS_FEATUREFINDER_FEATUREFINDERIDENTIFICATIONALGORITHM_H
36 #define OPENMS_TRANSFORMATIONS_FEATUREFINDER_FEATUREFINDERIDENTIFICATIONALGORITHM_H
37 
45 
46 #include <vector>
47 #include <fstream>
48 #include <map>
49 
50 namespace OpenMS
51 {
52  class IsotopeDistribution;
53 
54 
55 
57  public DefaultParamHandler
58 {
59 public:
62 
75  void run(
76  std::vector<PeptideIdentification> peptides,
77  const std::vector<ProteinIdentification>& proteins,
78  std::vector<PeptideIdentification> peptides_ext,
79  std::vector<ProteinIdentification> proteins_ext,
80  FeatureMap& features,
81  const FeatureMap& seeds = FeatureMap(),
82  const String& spectra_file = ""
83  );
84 
85  void runOnCandidates(FeatureMap& features);
86 
87  PeakMap& getMSData();
88  const PeakMap& getMSData() const;
89 
91  void setMSData(const PeakMap& ms_data); // for pyOpenMS
92  void setMSData(PeakMap&& ms_data); // moves peak data and saves the copy. Note that getMSData() will give back a processed/modified version.
93 
94  PeakMap& getChromatograms();
95  const PeakMap& getChromatograms() const;
96 
97  ProgressLogger& getProgressLogger();
98  const ProgressLogger& getProgressLogger() const;
99 
100  TargetedExperiment& getLibrary();
101  const TargetedExperiment& getLibrary() const;
102 
103 protected:
106 
108  typedef std::multimap<double, PeptideIdentification*> RTMap;
110  typedef std::map<Int, std::pair<RTMap, RTMap> > ChargeMap;
112  typedef std::map<AASequence, ChargeMap> PeptideMap;
114  typedef std::map<String, std::pair<RTMap, RTMap> > PeptideRefRTMap;
115 
117 
120 
122  double rt_window_;
123  double mz_window_;
125 
127 
128  double isotope_pmin_;
130 
131  double rt_quantile_;
132 
133  double peak_width_;
136 
138 
139  // SVM related parameters
146 
147  // output file (before filtering)
149 
151 
152  void updateMembers_() override;
153 
155  struct RTRegion
156  {
157  double start, end;
159  };
160 
163  {
164  bool operator()(const Feature& feature)
165  {
166  return feature.getOverallQuality() == 0.0;
167  }
168  } feature_filter_quality_;
169 
172  {
173  bool operator()(const Feature& feature)
174  {
175  return feature.getPeptideIdentifications().empty();
176  }
177  } feature_filter_peptides_;
178 
// Comparison functor for peptide identifications (instance: peptide_compare_).
// Ordering: first by the sequence of the first hit (compared as strings),
// then by the charge of the first hit, then by the RT of the identification.
// Both arguments are indexed with getHits()[0] without any check, so each
// identification must contain at least one hit.
181  {
183  const PeptideIdentification& p2)
184  {
// Only the first hit of each identification takes part in the comparison.
185  const String& seq1 = p1.getHits()[0].getSequence().toString();
186  const String& seq2 = p2.getHits()[0].getSequence().toString();
187  if (seq1 == seq2)
188  {
// Same sequence: fall back to the charge state.
189  Int charge1 = p1.getHits()[0].getCharge();
190  Int charge2 = p2.getHits()[0].getCharge();
191  if (charge1 == charge2)
192  {
// Same sequence and charge: order by retention time.
193  return p1.getRT() < p2.getRT();
194  }
195  return charge1 < charge2;
196  }
197  return seq1 < seq2;
198  }
199  } peptide_compare_;
200 
203  {
204  bool operator()(const Feature& f1, const Feature& f2)
205  {
206  const String& ref1 = f1.getMetaValue("PeptideRef");
207  const String& ref2 = f2.getMetaValue("PeptideRef");
208  if (ref1 == ref2)
209  {
210  return f1.getRT() < f2.getRT();
211  }
212  return ref1 < ref2;
213  }
214  } feature_compare_;
215 
219 
223  std::vector<PeptideIdentification> unassignedIDs_;
224 
225  const double seed_rt_window_ = 60.0;
226 
228  std::map<double, std::pair<Size, Size> > svm_probs_internal_;
230  std::multiset<double> svm_probs_external_;
233  TransformationDescription trafo_external_;
235  std::map<String, double> isotope_probs_;
237 
239 
241  void generateTransitions_(const String& peptide_id, double mz, Int charge,
242  const IsotopeDistribution& iso_dist);
243 
244  void addPeptideRT_(TargetedExperiment::Peptide& peptide, double rt) const;
245 
247  void getRTRegions_(ChargeMap& peptide_data, std::vector<RTRegion>& rt_regions, bool clear_IDs = true) const;
248 
249  void annotateFeaturesFinalizeAssay_(
250  FeatureMap& features,
251  std::map<Size, std::vector<PeptideIdentification*> >& feat_ids,
252  RTMap& rt_internal);
253 
255  void annotateFeatures_(FeatureMap& features, PeptideRefRTMap& ref_rt_map);
256 
257  void ensureConvexHulls_(Feature& feature) const;
258 
259  void postProcess_(FeatureMap& features, bool with_external_ids);
260 
262  void statistics_(const FeatureMap& features) const;
263 
267  void createAssayLibrary_(const PeptideMap::iterator& begin, const PeptideMap::iterator& end, PeptideRefRTMap& ref_rt_map, bool clear_IDs = true);
268 
272  void addPeptideToMap_(PeptideIdentification& peptide,
273  PeptideMap& peptide_map,
274  bool external = false);
275 
276  void checkNumObservations_(Size n_pos, Size n_neg, const String& note = "") const;
277 
278  void getUnbiasedSample_(const std::multimap<double, std::pair<Size, bool> >& valid_obs,
279  std::map<Size, double>& training_labels);
280 
281  void getRandomSample_(std::map<Size, double>& training_labels) const;
282 
283  void classifyFeatures_(FeatureMap& features);
284 
285  void filterFeaturesFinalizeAssay_(Feature& best_feature, double best_quality,
286  const double quality_cutoff);
287 
288  void filterFeatures_(FeatureMap& features, bool classified);
289 
290  void calculateFDR_(FeatureMap& features);
291 
/// Splits the iterator range [range_from, range_to) into consecutive batches.
///
/// With total = distance(range_from, range_to), the result holds
/// total / batch_size (integer division) batches. Every batch except the last
/// spans exactly @p batch_size elements; the last batch always ends at
/// @p range_to and therefore also absorbs the remainder.
///
/// If no complete batch fits (total < batch_size, including an empty range),
/// a single batch covering the whole range is returned. The same fallback is
/// used for a non-positive @p batch_size, which would otherwise cause a
/// division by zero or a negative element count.
///
/// @param range_from  start of the range to split
/// @param range_to    end (one past the last element) of the range to split
/// @param batch_size  desired number of elements per batch
/// @return pairs of (begin, end) iterators, one per batch, in order; never empty
template <typename It>
std::vector<std::pair<It,It>>
chunk_(It range_from, It range_to, const std::ptrdiff_t batch_size)
{
  using diff_t = std::ptrdiff_t;

  const diff_t total {std::distance(range_from, range_to)};
  // Guard the division: batch_size == 0 is undefined behaviour, and a negative
  // batch_size would produce a negative batch count.
  const diff_t num {batch_size > 0 ? total / batch_size : 0};

  std::vector<std::pair<It,It>> chunks;
  if (num <= 0)
  {
    // Not even one complete batch: hand back the whole range as one batch.
    chunks.emplace_back(range_from, range_to);
    return chunks;
  }

  chunks.reserve(static_cast<std::size_t>(num));

  // Emit the first num - 1 batches with exactly batch_size elements each.
  It batch_start {range_from};
  for (diff_t i = 1; i < num; ++i)
  {
    It batch_end {batch_start};
    std::advance(batch_end, batch_size);
    chunks.emplace_back(batch_start, batch_end);
    batch_start = batch_end;
  }

  // The last batch must always end at 'range_to' (it takes the remainder).
  chunks.emplace_back(batch_start, range_to);

  return chunks;
}
334 };
335 
336 } // namespace OpenMS
337 
338 #endif
339 
QualityType getOverallQuality() const
Non-mutable access to the overall quality.
double svm_quality_cutoff
Definition: FeatureFinderIdentificationAlgorithm.h:143
std::multimap< double, PeptideIdentification * > RTMap
mapping: RT (not necessarily unique) -> pointer to peptide
Definition: FeatureFinderIdentificationAlgorithm.h:108
std::vector< PeptideIdentification > unassignedIDs_
Definition: FeatureFinderIdentificationAlgorithm.h:223
A more convenient string class.
Definition: String.h:58
region in RT in which a peptide elutes:
Definition: FeatureFinderIdentificationAlgorithm.h:155
bool operator()(const PeptideIdentification &p1, const PeptideIdentification &p2)
Definition: FeatureFinderIdentificationAlgorithm.h:182
double mapping_tolerance_
RT tolerance for mapping IDs to features.
Definition: FeatureFinderIdentificationAlgorithm.h:126
Size n_internal_features_
internal feature counter (for FDR calculation)
Definition: FeatureFinderIdentificationAlgorithm.h:231
bool operator()(const Feature &f1, const Feature &f2)
Definition: FeatureFinderIdentificationAlgorithm.h:204
Helper struct for a collection of mass traces used in FeatureFinderAlgorithmPicked.
Definition: FeatureFinderAlgorithmPickedHelperStructs.h:109
A container for features.
Definition: FeatureMap.h:98
const std::vector< PeptideHit > & getHits() const
returns the peptide hits as const
Definition: FeatureFinderIdentificationAlgorithm.h:56
The MRMFeatureFinder finds and scores peaks of transitions that co-elute.
Definition: MRMFeatureFinderScoring.h:89
Definition: IsotopeDistribution.h:64
PeakMap chrom_data_
accumulated chromatograms (XICs)
Definition: FeatureFinderIdentificationAlgorithm.h:217
FeatureFinderAlgorithmPickedHelperStructs::MassTraces MassTraces
Definition: FeatureFinderIdentificationAlgorithm.h:105
StringList svm_predictor_names_
Definition: FeatureFinderIdentificationAlgorithm.h:141
const std::vector< PeptideIdentification > & getPeptideIdentifications() const
double min_peak_width_
Definition: FeatureFinderIdentificationAlgorithm.h:134
Size n_internal_peps_
number of internal peptides
Definition: FeatureFinderIdentificationAlgorithm.h:118
bool operator()(const Feature &feature)
Definition: FeatureFinderIdentificationAlgorithm.h:173
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:47
Size n_isotopes_
number of isotopes for peptide assay
Definition: FeatureFinderIdentificationAlgorithm.h:129
String elution_model_
Definition: FeatureFinderIdentificationAlgorithm.h:137
Size n_external_features_
Definition: FeatureFinderIdentificationAlgorithm.h:232
predicate for filtering features by assigned peptides:
Definition: FeatureFinderIdentificationAlgorithm.h:171
const DataValue & getMetaValue(const String &name) const
Returns the value corresponding to a string, or DataValue::EMPTY if not found.
Size batch_size_
nr of peptides to use at the same time during chromatogram extraction
Definition: FeatureFinderIdentificationAlgorithm.h:121
bool mz_window_ppm_
m/z window width is given in PPM (not Da)?
Definition: FeatureFinderIdentificationAlgorithm.h:124
PeakMap ms_data_
input LC-MS data
Definition: FeatureFinderIdentificationAlgorithm.h:216
std::map< AASequence, ChargeMap > PeptideMap
mapping: sequence -> charge -> internal/external ID information
Definition: FeatureFinderIdentificationAlgorithm.h:112
double psm_score_cutoff_
Definition: FeatureFinderIdentificationAlgorithm.h:222
double rt_quantile_
Definition: FeatureFinderIdentificationAlgorithm.h:131
std::vector< std::pair< It, It > > chunk_(It range_from, It range_to, const std::ptrdiff_t batch_size)
Definition: FeatureFinderIdentificationAlgorithm.h:296
String svm_xval_out_
Definition: FeatureFinderIdentificationAlgorithm.h:142
double isotope_pmin_
min. isotope probability for peptide assay
Definition: FeatureFinderIdentificationAlgorithm.h:128
double rt_window_
RT window width.
Definition: FeatureFinderIdentificationAlgorithm.h:122
double getRT() const
returns the RT of the MS2 spectrum where the identification occurred
std::map< double, std::pair< Size, Size > > svm_probs_internal_
SVM probability -> number of pos./neg. features (for FDR calculation):
Definition: FeatureFinderIdentificationAlgorithm.h:228
MRMFeatureFinderScoring feat_finder_
OpenSWATH feature finder.
Definition: FeatureFinderIdentificationAlgorithm.h:236
ProgressLogger prog_log_
Definition: FeatureFinderIdentificationAlgorithm.h:238
Size svm_n_parts_
number of partitions for SVM cross-validation
Definition: FeatureFinderIdentificationAlgorithm.h:144
std::map< String, double > isotope_probs_
isotope probabilities of transitions
Definition: FeatureFinderIdentificationAlgorithm.h:235
predicate for filtering features by overall quality:
Definition: FeatureFinderIdentificationAlgorithm.h:162
An LC-MS feature.
Definition: Feature.h:70
double peak_width_
Definition: FeatureFinderIdentificationAlgorithm.h:133
double start
Definition: FeatureFinderIdentificationAlgorithm.h:157
String candidates_out_
Definition: FeatureFinderIdentificationAlgorithm.h:148
std::vector< String > StringList
Vector of String.
Definition: ListUtils.h:70
TargetedExperiment library_
accumulated assays for peptides
Definition: FeatureFinderIdentificationAlgorithm.h:218
ChargeMap ids
internal/external peptide IDs (per charge) in this region
Definition: FeatureFinderIdentificationAlgorithm.h:158
In-Memory representation of a mass spectrometry run.
Definition: MSExperiment.h:70
CoordinateType getRT() const
Returns the RT coordinate (index 0)
Definition: Peak2D.h:210
bool quantify_decoys_
Definition: FeatureFinderIdentificationAlgorithm.h:220
PeptideMap peptide_map_
Definition: FeatureFinderIdentificationAlgorithm.h:116
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:127
FeatureFinderAlgorithmPickedHelperStructs::MassTrace MassTrace
Definition: FeatureFinderIdentificationAlgorithm.h:104
comparison functor for (unassigned) peptide IDs
Definition: FeatureFinderIdentificationAlgorithm.h:180
comparison functor for features
Definition: FeatureFinderIdentificationAlgorithm.h:202
Base class for all classes that want to report their progress.
Definition: ProgressLogger.h:52
bool operator()(const Feature &feature)
Definition: FeatureFinderIdentificationAlgorithm.h:164
A description of a targeted experiment containing precursor and product ions.
Definition: TargetedExperiment.h:64
Size n_external_peps_
number of external peptides
Definition: FeatureFinderIdentificationAlgorithm.h:119
double signal_to_noise_
Definition: FeatureFinderIdentificationAlgorithm.h:135
double svm_min_prob_
Definition: FeatureFinderIdentificationAlgorithm.h:140
Size svm_n_samples_
number of samples for SVM training
Definition: FeatureFinderIdentificationAlgorithm.h:145
Generic description of a coordinate transformation.
Definition: TransformationDescription.h:62
A base class for all classes handling default parameters.
Definition: DefaultParamHandler.h:92
std::map< Int, std::pair< RTMap, RTMap > > ChargeMap
mapping: charge -> internal/external: (RT -> pointer to peptide)
Definition: FeatureFinderIdentificationAlgorithm.h:110
std::map< String, std::pair< RTMap, RTMap > > PeptideRefRTMap
mapping: peptide ref. -> int./ext.: (RT -> pointer to peptide)
Definition: FeatureFinderIdentificationAlgorithm.h:114
int Int
Signed integer type.
Definition: Types.h:102
double mz_window_
m/z window width
Definition: FeatureFinderIdentificationAlgorithm.h:123
std::multiset< double > svm_probs_external_
SVM probabilities for "external" features (for FDR calculation):
Definition: FeatureFinderIdentificationAlgorithm.h:230
Size debug_level_
Definition: FeatureFinderIdentificationAlgorithm.h:150
Represents the peptide hits for a spectrum.
Definition: PeptideIdentification.h:63
Helper struct for mass traces used in FeatureFinderAlgorithmPicked.
Definition: FeatureFinderAlgorithmPickedHelperStructs.h:79
Represents a peptide (amino acid sequence)
Definition: TargetedExperimentHelper.h:358
bool use_psm_cutoff_
Definition: FeatureFinderIdentificationAlgorithm.h:221