OpenMS
PosteriorErrorProbabilityModel.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2023.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Timo Sachsenberg $
32 // $Authors: David Wojnar $
33 // --------------------------------------------------------------------------
34 
35 #pragma once
36 
43 
44 #include <vector>
45 #include <map>
46 
/// Main OpenMS namespace.
47 namespace OpenMS
48 {
/// Forward declarations of types referenced in the signatures below.
49  class String;
50  class TextFile;
51  class PeptideIdentification;
52  class ProteinIdentification;
53  class PeptideHit;
54  namespace Math
55  {
56 
57 
/// Implements a mixture model of the inverse gumbel and the gauss distribution or a gaussian mixture.
/// Derives from DefaultParamHandler (base class for classes handling default parameters).
/// NOTE(review): this listing has gaps (constructors, several signature lines and the
/// private data members are not shown) -- confirm details against the original header.
74  class OPENMS_DLLAPI PosteriorErrorProbabilityModel :
75  public DefaultParamHandler
76  {
77 public:
78 
81 
84 
/// Extract and transform score types to a range and score orientation that the PEP model can handle.
/// Returns a map keyed by String whose values are vectors of score vectors.
/// NOTE(review): the meaning of the map key and of the inner vectors is not visible here -- confirm.
96  static std::map<String, std::vector<std::vector<double>>> extractAndTransformScores(
97  const std::vector<ProteinIdentification> & protein_ids,
98  const std::vector<PeptideIdentification> & peptide_ids,
99  const bool split_charge,
100  const bool top_hits_only,
101  const bool target_decoy_available,
102  const double fdr_for_targets_smaller);
103 
/// Update score entries with PEP (or 1-PEP) estimates.
/// protein_ids and peptide_ids are taken by non-const reference; the two trailing bool
/// references (unable_to_fit_data, data_might_not_be_well_fit) are output flags.
/// NOTE(review): presumably prob_correct selects 1-PEP instead of PEP -- confirm.
117  static void updateScores(
118  const PosteriorErrorProbabilityModel & PEP_model,
119  const String & search_engine,
120  const Int charge,
121  const bool prob_correct,
122  const bool split_charge,
123  std::vector<ProteinIdentification> & protein_ids,
124  std::vector<PeptideIdentification> & peptide_ids,
125  bool & unable_to_fit_data,
126  bool & data_might_not_be_well_fit);
127 
/// Fits the distributions to the data points (search_engine_scores).
/// The score vector is taken by non-const reference.
/// NOTE(review): meaning of the bool return value is not visible here -- confirm.
136  bool fit(std::vector<double> & search_engine_scores, const String& outlier_handling);
137 
/// Fits the distributions to the data points (search_engine_scores).
/// NOTE(review): presumably the gumbel + gauss variant of fit(), judging by the name -- confirm.
146  bool fitGumbelGauss(std::vector<double>& search_engine_scores, const String& outlier_handling);
147 
/// Fits the distributions to the data points (search_engine_scores) and writes the
/// computed probabilities into the probabilities vector.
155  bool fit(std::vector<double> & search_engine_scores, std::vector<double> & probabilities, const String& outlier_handling);
156 
/// Writes the distribution densities for a set of scores into the two output vectors.
158  void fillDensities(const std::vector<double> & x_scores, std::vector<double> & incorrect_density, std::vector<double> & correct_density);
/// Writes the log distribution densities for a set of scores into the two output vectors.
160  void fillLogDensities(const std::vector<double> & x_scores, std::vector<double> & incorrect_density, std::vector<double> & correct_density);
/// Writes the log of the gumbel and gauss densities for a set of scores into the two output vectors.
162  void fillLogDensitiesGumbel(const std::vector<double> & x_scores, std::vector<double> & incorrect_density, std::vector<double> & correct_density);
/// Computes the likelihood with a log-likelihood function from the two density vectors.
164  double computeLogLikelihood(const std::vector<double> & incorrect_density, const std::vector<double> & correct_density) const;
165 
/// NOTE(review): the line carrying the return type and function name is missing from this
/// listing. The parameter list below matches
/// double computeLLAndIncorrectPosteriorsFromLogDensities(...) const -- confirm.
171  const std::vector<double>& incorrect_log_density,
172  const std::vector<double>& correct_log_density,
173  std::vector<double>& incorrect_posterior) const;
174 
/// Returns a pair of doubles computed from the scores and the incorrect posteriors.
/// NOTE(review): presumably posterior-weighted means of the positive and negative
/// component, judging by the name -- confirm order of the pair.
181  std::pair<double, double> pos_neg_mean_weighted_posteriors(const std::vector<double> &x_scores,
182  const std::vector<double> &incorrect_posteriors);
183 
/// Returns a pair of doubles computed from the scores, the incorrect posteriors and a pair of means.
/// NOTE(review): presumably posterior-weighted sigmas matching the means above -- confirm.
190  std::pair<double, double> pos_neg_sigma_weighted_posteriors(const std::vector<double> &x_scores,
191  const std::vector<double> &incorrect_posteriors,
192  const std::pair<double, double>& means);
193 
/// NOTE(review): signature line is missing from this listing; this body belongs to the
/// getter that returns the fit parameters stored for correctly assigned sequences.
196  {
197  return correctly_assigned_fit_param_;
198  }
199 
/// NOTE(review): signature line is missing from this listing; this body belongs to the
/// getter that returns the fit parameters stored for incorrectly assigned sequences.
202  {
203  return incorrectly_assigned_fit_param_;
204  }
205 
/// NOTE(review): signature line is missing from this listing; this body belongs to the
/// getter that returns the gumbel fit parameters stored for incorrectly assigned sequences.
208  {
209  return incorrectly_assigned_fit_gumbel_param_;
210  }
211 
/// Returns the estimated negative prior probability.
213  double getNegativePrior() const
214  {
215  return negative_prior_;
216  }
217 
/// Computes the gumbel density at position x with parameters params.
/// Only params.x0 and params.sigma are read: with z = exp((x0 - x) / sigma) the
/// result is z * exp(-z) / sigma. sigma is used as a divisor and is not checked for zero.
219  static double getGumbel_(double x, const GaussFitter::GaussFitResult & params)
220  {
221  double z = exp((params.x0 - x) / params.sigma);
222  return (z * exp(-1 * z)) / params.sigma;
223  }
224 
/// Computes a probability for the given score.
/// NOTE(review): presumably requires a successful fit beforehand -- confirm.
229  double computeProbability(double score) const;
230 
/// Initializes the plots; returns a TextFile.
232  TextFile initPlots(std::vector<double> & x_scores);
233 
236 
239 
242 
/// Plots the estimated distribution against target and decoy hits.
244  void plotTargetDecoyEstimation(std::vector<double> & target, std::vector<double> & decoy);
245 
/// Returns the smallest score used in the last fit.
247  inline double getSmallestScore() const
248  {
249  return smallest_score_;
250  }
251 
/// Try to invoke 'gnuplot' on the file to create PDF automatically.
253  void tryGnuplot(const String& gp_file);
254 
255 private:
/// Processes outliers in x_scores according to outlier_handling; x_scores is taken by non-const reference.
/// NOTE(review): the accepted outlier_handling values are not visible here -- confirm.
257  void processOutliers_(std::vector<double>& x_scores, const String& outlier_handling) const;
258 
/// Returns a transformed score for the given hit, engine name and current score type.
/// NOTE(review): presumably maps engine-specific scores to the range/orientation the model can handle -- confirm.
263  static double transformScore_(const String& engine, const PeptideHit& hit, const String& current_score_type);
264 
/// Returns a score for the hit given the requested score types and the hit's actual score type.
/// NOTE(review): lookup and fallback behavior is not visible here -- confirm.
269  static double getScore_(const std::vector<String>& requested_score_types, const PeptideHit & hit, const String& actual_score_type);
270 
/// NOTE(review): the private data members referenced by the getters above are not shown in this listing.
/// Pointer to the const member function that produces the gnuplot formula for the negative (incorrect) component.
289  const String (PosteriorErrorProbabilityModel::* getNegativeGnuplotFormula_)(const GaussFitter::GaussFitResult & params) const;
/// Pointer to the const member function that produces the gnuplot formula for the positive (correct) component.
291  const String (PosteriorErrorProbabilityModel::* getPositiveGnuplotFormula_)(const GaussFitter::GaussFitResult & params) const;
292  };
293  }
294 }
295 
A base class for all classes handling default parameters.
Definition: DefaultParamHandler.h:92
Implements a mixture model of the inverse gumbel and the gauss distribution or a gaussian mixture.
Definition: PosteriorErrorProbabilityModel.h:76
std::pair< double, double > pos_neg_mean_weighted_posteriors(const std::vector< double > &x_scores, const std::vector< double > &incorrect_posteriors)
double computeLLAndIncorrectPosteriorsFromLogDensities(const std::vector< double > &incorrect_log_density, const std::vector< double > &correct_log_density, std::vector< double > &incorrect_posterior) const
const String getBothGnuplotFormula(const GaussFitter::GaussFitResult &incorrect, const GaussFitter::GaussFitResult &correct) const
returns the gnuplot formula of the fitted mixture distribution.
GumbelMaxLikelihoodFitter::GumbelDistributionFitResult incorrectly_assigned_fit_gumbel_param_
Definition: PosteriorErrorProbabilityModel.h:277
PosteriorErrorProbabilityModel & operator=(const PosteriorErrorProbabilityModel &rhs)
assignment operator (not implemented)
TextFile initPlots(std::vector< double > &x_scores)
initializes the plots
const String getGaussGnuplotFormula(const GaussFitter::GaussFitResult &params) const
returns the gnuplot formula of the fitted gauss distribution.
void plotTargetDecoyEstimation(std::vector< double > &target, std::vector< double > &decoy)
plots the estimated distribution against target and decoy hits
static double transformScore_(const String &engine, const PeptideHit &hit, const String &current_score_type)
GaussFitter::GaussFitResult incorrectly_assigned_fit_param_
stores parameters for incorrectly assigned sequences. If gumbel fit was used, A can be ignored....
Definition: PosteriorErrorProbabilityModel.h:276
double max_correctly_
peak of the gauss distribution (correctly assigned sequences)
Definition: PosteriorErrorProbabilityModel.h:285
double computeProbability(double score) const
void fillDensities(const std::vector< double > &x_scores, std::vector< double > &incorrect_density, std::vector< double > &correct_density)
Writes the distributions densities into the two vectors for a set of scores. Incorrect_densities repr...
static std::map< String, std::vector< std::vector< double > > > extractAndTransformScores(const std::vector< ProteinIdentification > &protein_ids, const std::vector< PeptideIdentification > &peptide_ids, const bool split_charge, const bool top_hits_only, const bool target_decoy_available, const double fdr_for_targets_smaller)
extract and transform score types to a range and score orientation that the PEP model can handle
bool fit(std::vector< double > &search_engine_scores, const String &outlier_handling)
fits the distributions to the data points(search_engine_scores). Estimated parameters for the distrib...
PosteriorErrorProbabilityModel(const PosteriorErrorProbabilityModel &rhs)
Copy constructor (not implemented)
bool fit(std::vector< double > &search_engine_scores, std::vector< double > &probabilities, const String &outlier_handling)
fits the distributions to the data points(search_engine_scores) and writes the computed probabilities...
static double getScore_(const std::vector< String > &requested_score_types, const PeptideHit &hit, const String &actual_score_type)
GaussFitter::GaussFitResult getIncorrectlyAssignedFitResult() const
returns estimated parameters for incorrectly assigned sequences. Fit should be used before.
Definition: PosteriorErrorProbabilityModel.h:201
double getNegativePrior() const
returns the estimated negative prior probability.
Definition: PosteriorErrorProbabilityModel.h:213
const String getGumbelGnuplotFormula(const GaussFitter::GaussFitResult &params) const
returns the gnuplot formula of the fitted gumbel distribution. Only x0 and sigma are used as local pa...
void fillLogDensities(const std::vector< double > &x_scores, std::vector< double > &incorrect_density, std::vector< double > &correct_density)
Writes the log distributions densities into the two vectors for a set of scores. Incorrect_densities ...
double negative_prior_
stores final prior probability for negative peptides
Definition: PosteriorErrorProbabilityModel.h:281
void fillLogDensitiesGumbel(const std::vector< double > &x_scores, std::vector< double > &incorrect_density, std::vector< double > &correct_density)
Writes the log distributions of gumbel and gauss densities into the two vectors for a set of scores....
~PosteriorErrorProbabilityModel() override
Destructor.
void tryGnuplot(const String &gp_file)
try to invoke 'gnuplot' on the file to create PDF automatically
void processOutliers_(std::vector< double > &x_scores, const String &outlier_handling) const
transform different score types to a range and score orientation that the model can handle (engine st...
GaussFitter::GaussFitResult correctly_assigned_fit_param_
stores gauss parameters
Definition: PosteriorErrorProbabilityModel.h:279
double max_incorrectly_
peak of the incorrectly assigned sequences distribution
Definition: PosteriorErrorProbabilityModel.h:283
static void updateScores(const PosteriorErrorProbabilityModel &PEP_model, const String &search_engine, const Int charge, const bool prob_correct, const bool split_charge, std::vector< ProteinIdentification > &protein_ids, std::vector< PeptideIdentification > &peptide_ids, bool &unable_to_fit_data, bool &data_might_not_be_well_fit)
update score entries with PEP (or 1-PEP) estimates
PosteriorErrorProbabilityModel()
default constructor
GaussFitter::GaussFitResult getCorrectlyAssignedFitResult() const
returns estimated parameters for correctly assigned sequences. Fit should be used before.
Definition: PosteriorErrorProbabilityModel.h:195
static double getGumbel_(double x, const GaussFitter::GaussFitResult &params)
computes the gumbel density at position x with parameters params.
Definition: PosteriorErrorProbabilityModel.h:219
double smallest_score_
smallest score which was used for fitting the model
Definition: PosteriorErrorProbabilityModel.h:287
double getSmallestScore() const
returns the smallest score used in the last fit
Definition: PosteriorErrorProbabilityModel.h:247
bool fitGumbelGauss(std::vector< double > &search_engine_scores, const String &outlier_handling)
fits the distributions to the data points(search_engine_scores). Estimated parameters for the distrib...
std::pair< double, double > pos_neg_sigma_weighted_posteriors(const std::vector< double > &x_scores, const std::vector< double > &incorrect_posteriors, const std::pair< double, double > &means)
GumbelMaxLikelihoodFitter::GumbelDistributionFitResult getIncorrectlyAssignedGumbelFitResult() const
returns estimated gumbel parameters for incorrectly assigned sequences. Fit should be used before.
Definition: PosteriorErrorProbabilityModel.h:207
double computeLogLikelihood(const std::vector< double > &incorrect_density, const std::vector< double > &correct_density) const
computes the Likelihood with a log-likelihood function.
Representation of a peptide hit.
Definition: PeptideHit.h:57
A more convenient string class.
Definition: String.h:60
This class provides some basic file handling methods for text files.
Definition: TextFile.h:47
int Int
Signed integer type.
Definition: Types.h:102
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:48
struct of parameters of a Gaussian distribution
Definition: GaussFitter.h:66
double sigma
parameter sigma of Gaussian distribution (width)
Definition: GaussFitter.h:80
double x0
parameter x0 of Gaussian distribution (center position)
Definition: GaussFitter.h:77
struct to represent the parameters of a gumbel distribution
Definition: GumbelMaxLikelihoodFitter.h:64