OpenMS  2.4.0
FalseDiscoveryRate.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2018.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Chris Bielow $
32 // $Authors: Andreas Bertsch, Chris Bielow $
33 // --------------------------------------------------------------------------
34 
35 #pragma once
36 
40 
41 #include <vector>
42 #include <unordered_set>
43 
44 namespace OpenMS
45 {
65  class OPENMS_DLLAPI FalseDiscoveryRate :
66  public DefaultParamHandler
67  {
68 public:
71 
78  void apply(std::vector<PeptideIdentification> & fwd_ids, std::vector<PeptideIdentification> & rev_ids) const;
79 
85  void apply(std::vector<PeptideIdentification> & id) const;
86 
93  void apply(std::vector<ProteinIdentification>& fwd_ids, std::vector<ProteinIdentification>& rev_ids) const;
94 
100  void apply(std::vector<ProteinIdentification>& ids) const;
101 
107  void applyEstimated(std::vector<ProteinIdentification>& ids) const;
108 
118  double applyEvaluateProteinIDs(const std::vector<ProteinIdentification>& ids, double pepCutoff = 1.0, UInt fpCutoff = 50, double diffWeight = 0.2);
119  double applyEvaluateProteinIDs(const ProteinIdentification& ids, double pepCutoff = 1.0, UInt fpCutoff = 50, double diffWeight = 0.2);
120 
122  void applyBasic(std::vector<PeptideIdentification> & ids);
123  void applyBasic(ProteinIdentification & id, bool groups_too = true);
124 
127  double rocN(const std::vector<PeptideIdentification>& ids, Size fp_cutoff) const;
128 
129 
130 private:
131 
134 
136  FalseDiscoveryRate & operator=(const FalseDiscoveryRate &);
137 
138  //TODO we could add identifier here. If we need to combine runs.
139  void getScores_(
140  std::vector<std::pair<double,bool>>& scores_labels,
141  const ProteinIdentification & id) const;
142 
143  void getScores_(
144  std::vector<std::pair<double,bool>>& scores_labels,
145  const std::vector<ProteinIdentification::ProteinGroup> & grps,
146  const std::unordered_set<std::string> & decoy_accs) const;
147 
148  void getScores_(
149  std::vector<std::pair<double,bool>>& scores_labels,
150  const std::vector<PeptideIdentification> & ids,
151  bool all_hits,
152  int charge, String identifier) const;
153 
154  void getScores_(
155  std::vector<std::pair<double,bool>>& scores_labels,
156  const std::vector<PeptideIdentification> & targets,
157  const std::vector<PeptideIdentification> & decoys,
158  bool all_hits,
159  int charge,
160  const String& identifier) const;
161 
162  void setScores_(
163  const std::map<double,double>& scores_to_FDR,
164  std::vector<PeptideIdentification> & id,
165  const std::string& score_type,
166  bool higher_better) const;
167 
168  template <typename IDType>
169  void setScores_(const std::map<double,double>& scores_to_FDR, IDType & id, const std::string& score_type, bool higher_better) const
170  {
171  String old_score_type = id.getScoreType() + "_score";
172  id.setScoreType(score_type);
173  id.setHigherScoreBetter(higher_better);
174  for (auto& hit : id.getHits())
175  {
176  double old_score = hit.getScore();
177  hit.setScore(scores_to_FDR.lower_bound(hit.getScore())->second);
178  hit.setMetaValue(old_score_type, old_score);
179  }
180  }
181 
182  void setScores_(
183  const std::map<double,double>& scores_to_FDR,
184  std::vector<ProteinIdentification::ProteinGroup>& grps,
185  const std::string& score_type,
186  bool higher_better) const;
187 
188  template <typename IDType>
189  void checkTDAnnotation_ (const IDType & id) const
190  {
191  for (auto const& hit : id.getHits())
192  {
193  if (!hit.metaValueExists("target_decoy"))
194  {
195  throw Exception::MissingInformation(__FILE__,
196  __LINE__,
197  OPENMS_PRETTY_FUNCTION,
198  "Meta value 'target_decoy' does not exist in all ProteinHits! Reindex the idXML file with 'PeptideIndexer'");
199  }
200  }
201  }
202 
203  template <typename HitType>
204  struct GetLabelFunctor: std::function<bool(const HitType&)>
205  {
206  bool operator() (const HitType& hit)
207  {
208  //TODO if we checked in the beginning, this check could be skipped.
209  if (!hit.metaValueExists("target_decoy"))
210  {
211  throw Exception::MissingInformation(__FILE__,
212  __LINE__,
213  OPENMS_PRETTY_FUNCTION,
214  "Meta value 'target_decoy' does not exist in all ProteinHits! Reindex the idXML file with 'PeptideIndexer'");
215  }
216  else
217  {
218  return std::string(hit.getMetaValue("target_decoy"))[0] == 't';
219  }
220  }
221  };
222 
223  template <typename HitType>
224  struct TrueFunctor: std::function<bool(const HitType&)>
225  {
226  bool operator() (const HitType& /*hit*/)
227  {
228  return true;
229  }
230  };
231 
232  template <typename HitType>
233  struct FalseFunctor: std::function<bool(const HitType&)>
234  {
235  bool operator() (const HitType& /*hit*/)
236  {
237  return false;
238  }
239  };
240 
241 
242  template <typename HitType>
243  std::pair<double,bool> getScoreLabel_(const HitType& hit, std::function<bool(const HitType&)> fun) const
244  {
245  return std::make_pair(hit.getScore(), fun(hit));
246  }
247 
249  void calculateFDRs_(Map<double, double>& score_to_fdr, std::vector<double>& target_scores, std::vector<double>& decoy_scores, bool q_value, bool higher_score_better) const;
250 
253  void calculateEstimatedQVal_(std::map<double, double> &scores_to_FDR,
254  std::vector<std::pair<double, bool>> &scores_labels,
255  bool higher_score_better) const;
256 
258  void calculateFDRBasic_(std::map<double,double>& scores_to_FDR, std::vector<std::pair<double,bool>>& scores_labels, bool qvalue, bool higher_score_better);
259 
260  //TODO the next two methods could potentially be merged for speed (they iterate over the same structure)
261  //But since they have different cutoff types and it is more generic, I leave it like this.
263  double diffEstimatedEmpirical_(const std::vector<std::pair<double, bool>>& scores_labels, double pepCutoff = 1.0);
264 
267  double rocN_(std::vector<std::pair<double, bool>> const &scores_labels, Size fpCutoff = 50) const;
268 
271  double trapezoidal_area_xEqy(double exp1, double exp2, double act1, double act2) const;
272 
274  double trapezoidal_area(double x1, double x2, double y1, double y2) const;
275 
276 
277  };
278 
279 } // namespace OpenMS
280 
Representation of a protein identification run.
Definition: ProteinIdentification.h:68
A more convenient string class.
Definition: String.h:58
void checkTDAnnotation_(const IDType &id) const
Definition: FalseDiscoveryRate.h:189
unsigned int UInt
Unsigned integer type.
Definition: Types.h:94
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:46
std::pair< double, bool > getScoreLabel_(const HitType &hit, std::function< bool(const HitType &)> fun) const
Definition: FalseDiscoveryRate.h:243
Definition: FalseDiscoveryRate.h:233
Definition: FalseDiscoveryRate.h:204
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:127
Definition: FalseDiscoveryRate.h:224
Calculates an FDR from identifications.
Definition: FalseDiscoveryRate.h:65
void setScores_(const std::map< double, double > &scores_to_FDR, IDType &id, const std::string &score_type, bool higher_better) const
Definition: FalseDiscoveryRate.h:169
A base class for all classes handling default parameters.
Definition: DefaultParamHandler.h:91
Map class based on the STL map (containing several convenience functions)
Definition: Map.h:50
Not all required information provided.
Definition: Exception.h:195