OpenMS
DBSuitability.h
Go to the documentation of this file.
1 // Copyright (c) 2002-2023, The OpenMS Team -- EKU Tuebingen, ETH Zurich, and FU Berlin
2 // SPDX-License-Identifier: BSD-3-Clause
3 //
4 // --------------------------------------------------------------------------
5 // $Maintainer: Tom Waschischeck $
6 // $Authors: Tom Waschischeck $
7 // --------------------------------------------------------------------------
8 
9 #pragma once
10 
11 #include <OpenMS/CONCEPT/Types.h>
15 
16 #include <cfloat>
17 #include <vector>
18 
19 #include <boost/regex.hpp>
20 
21 namespace OpenMS
22 {
23  class ParamXMLFile; // forward declaration; no declaration visible in this listing refers to it
24  class PeptideIdentification; // forward declaration; represents the peptide hits for a spectrum
25  class PeptideHit; // forward declaration; representation of a peptide hit
26  class MSExperiment; // forward declaration; in-memory representation of a mass spectrometry run
27 
45  class OPENMS_DLLAPI DBSuitability: // holds the functionality of calculating the database suitability
46  public DefaultParamHandler // DefaultParamHandler: base class for all classes handling default parameters
47  {
48  public:
50  struct OPENMS_DLLAPI SuitabilityData // struct to store results; getResults() returns a vector of these
51  {
53  Size num_top_novo = 0; // starts at 0; presumably the number of top-scoring de novo hits -- NOTE(review): confirm against the implementation
54 
56  Size num_top_db = 0; // starts at 0; presumably the number of top-scoring database hits -- NOTE(review): confirm
55 
59  Size num_interest = 0; // starts at 0; exact meaning is not visible in this listing -- TODO confirm
60 
63  Size num_re_ranked = 0; // starts at 0; presumably counts re-ranked cases -- NOTE(review): confirm
64 
67  double cut_off = DBL_MAX; // starts at DBL_MAX (<cfloat>); presumably replaced by a computed cut-off (cf. getDecoyCutOff_) -- confirm
68 
78  double suitability = 0; // starts at 0; the suitability value of this result
79 
82  double suitability_no_rerank = 0; // starts at 0; presumably the suitability if re-ranking did not happen -- confirm
83 
85  double suitability_corr_no_rerank = 0; // starts at 0; presumably the corrected suitability without re-ranking -- confirm
86 
87  // resets all members to their defaults
88  void clear();
89 
92  void setCorrectionFactor(double factor); // sets the correction factor; definition not visible here (presumably also updates the private corrected values -- confirm)
93 
94  double getCorrectionFactor() const; // read accessor for the correction factor (presumably corr_factor -- confirm)
95 
96  double getCorrectedNovoHits() const; // read accessor; presumably returns num_top_novo_corr -- confirm
97 
98  double getCorrectedSuitability() const; // read accessor; presumably returns suitability_corr -- confirm
99 
109 
110  private:
116  double corr_factor = -1; // starts at -1; presumably marks "no factor set yet" -- NOTE(review): confirm
117 
119  double num_top_novo_corr = 0; // starts at 0; presumably the corrected number of top de novo hits -- confirm
120 
126  double suitability_corr = 0; // starts at 0; presumably the corrected suitability -- confirm
127  };
128 
133 
135  ~DBSuitability() override = default; // destructor; compiler-generated, overrides the base class destructor
136 
138  friend class DBSuitability_friend; // gives DBSuitability_friend access to the private members below
139 
203  void compute(std::vector<PeptideIdentification>&& pep_ids, const MSExperiment& exp, const std::vector<FASTAFile::FASTAEntry>& original_fasta, const std::vector<FASTAFile::FASTAEntry>& novo_fasta, const ProteinIdentification::SearchParameters& search_params); // computes suitability of a database used to search an mzML file; pep_ids is taken by rvalue reference
204 
214  const std::vector<SuitabilityData>& getResults() const; // returns results calculated by this metric
215 
216  private:
218  std::vector<SuitabilityData> results_; // result vector
219 
221  const boost::regex decoy_pattern_; // pattern for finding a decoy string
222 
237  double getDecoyDiff_(const PeptideIdentification& pep_id) const; // calculates the xcorr difference between the top two hits marked as decoy
238 
253  double getDecoyCutOff_(const std::vector<PeptideIdentification>& pep_ids, double reranking_cutoff_percentile) const; // calculates an xcorr cut-off based on decoy hits
254 
268  bool isNovoHit_(const PeptideHit& hit) const; // tests if a PeptideHit is considered a deNovo hit
269 
278  bool checkScoreBetterThanThreshold_(const PeptideHit& hit, double threshold, bool higher_score_better) const; // tests if a PeptideHit has a score better than the given threshold
279 
290  std::pair<String, Param> extractSearchAdapterInfoFromMetaValues_(const ProteinIdentification::SearchParameters& meta_values) const; // looks through meta values of SearchParameters to find out which search adapter was used
291 
299  void writeIniFile_(const Param& parameters, const String& filename) const; // writes parameters into a given file
300 
323  std::vector<PeptideIdentification> runIdentificationSearch_(const MSExperiment& exp, const std::vector<FASTAFile::FASTAEntry>& fasta_data, const String& adapter_name, Param& parameters) const; // executes the workflow from search adapter, followed by PeptideIndexer, and finishes with FDR
324 
335  std::vector<FASTAFile::FASTAEntry> getSubsampledFasta_(const std::vector<FASTAFile::FASTAEntry>& fasta_data, double subsampling_rate) const; // creates a subsampled fasta with the given subsampling rate
336 
353  void calculateSuitability_(const std::vector<PeptideIdentification>& pep_ids, SuitabilityData& data) const; // calculates all suitability data from a combined deNovo+database search; data is a non-const reference (presumably the output -- confirm)
354 
364  void appendDecoys_(std::vector<FASTAFile::FASTAEntry>& fasta) const; // calculates and appends decoys to a given vector of FASTAEntry
365 
373  double extractScore_(const PeptideHit& pep_hit) const; // returns the cross correlation score normalized by MW (if existing); fallback behavior is not visible in this listing
374 
388  double calculateCorrectionFactor_(const SuitabilityData& data, const SuitabilityData& data_sampled, double sampling_rate) const; // calculates the correction factor from two suitability calculations
389 
398  UInt numberOfUniqueProteins_(const std::vector<PeptideIdentification>& peps, UInt number_of_hits = 1) const; // determines the number of unique proteins found in the protein accessions of PeptideIdentifications; number_of_hits defaults to 1
399 
408  Size getIndexWithMedianNovoHits_(const std::vector<SuitabilityData>& data) const; // finds the SuitabilityData object with the median number of de novo hits
409 
426  double getScoreMatchingFDR_(const std::vector<PeptideIdentification>& pep_ids, double FDR, const String& score_name, bool higher_score_better) const; // extracts the worst score that still passes a FDR (q-value) threshold
427  };
428 
429  // friend class to test private member functions
430  class DBSuitability_friend // declared as a friend inside DBSuitability, so it may call the private *_ members
431  {
432  public:
433  DBSuitability_friend() = default; // compiler-generated default constructor
434 
436 
437  std::vector<FASTAFile::FASTAEntry> getSubsampledFasta(const std::vector<FASTAFile::FASTAEntry>& fasta_data, double subsampling_rate) // forwards to the private DBSuitability::getSubsampledFasta_
438  {
439  return suit_.getSubsampledFasta_(fasta_data, subsampling_rate);
440  }
441 
442  void appendDecoys(std::vector<FASTAFile::FASTAEntry>& fasta) // forwards to the private DBSuitability::appendDecoys_; fasta is passed on by non-const reference
443  {
444  suit_.appendDecoys_(fasta);
445  }
446 
447  double calculateCorrectionFactor(const DBSuitability::SuitabilityData& data, const DBSuitability::SuitabilityData& data_sampled, double sampling_rate) // forwards to the private DBSuitability::calculateCorrectionFactor_
448  {
449  return suit_.calculateCorrectionFactor_(data, data_sampled, sampling_rate);
450  }
451 
452  UInt numberOfUniqueProteins(const std::vector<PeptideIdentification>& peps, UInt number_of_hits = 1) // forwards to the private DBSuitability::numberOfUniqueProteins_; number_of_hits defaults to 1 like the wrapped function
453  {
454  return suit_.numberOfUniqueProteins_(peps, number_of_hits);
455  }
456 
457  Size getIndexWithMedianNovoHits(const std::vector<DBSuitability::SuitabilityData>& data) // forwards to the private DBSuitability::getIndexWithMedianNovoHits_
458  {
459  return suit_.getIndexWithMedianNovoHits_(data);
460  }
461 
462  double getScoreMatchingFDR(const std::vector<PeptideIdentification>& pep_ids, double FDR, String score_name, bool higher_score_better) // forwards to the private DBSuitability::getScoreMatchingFDR_; score_name is taken by value here, the wrapped function takes const String&
463  {
464  return suit_.getScoreMatchingFDR_(pep_ids, FDR, score_name, higher_score_better);
465  }
466 
467  /* Not tested:
468  getDecoyDiff_, getDecoyCutOff_, isNovoHit_, checkScoreBetterThanThreshold_
469  Reason: These functions are essential to the normal suitability calculation and if something would not work, the test for 'compute' would fail.
470 
471  extractSearchAdapterInfoFromMetaValues_, writeIniFile_, extractScore_
472  Reason: These functions are very straightforward.
473 
474  runIdentificationSearch_
475  Reason: This function simulates a whole workflow and testing it would be too complicated.
476  */
477 
478  private:
479  DBSuitability suit_; // the DBSuitability instance on which every wrapper above calls the private member function
480  };
481 }
482 
Definition: DBSuitability.h:431
std::vector< FASTAFile::FASTAEntry > getSubsampledFasta(const std::vector< FASTAFile::FASTAEntry > &fasta_data, double subsampling_rate)
Definition: DBSuitability.h:437
UInt numberOfUniqueProteins(const std::vector< PeptideIdentification > &peps, UInt number_of_hits=1)
Definition: DBSuitability.h:452
double calculateCorrectionFactor(const DBSuitability::SuitabilityData &data, const DBSuitability::SuitabilityData &data_sampled, double sampling_rate)
Definition: DBSuitability.h:447
DBSuitability suit_
Definition: DBSuitability.h:479
double getScoreMatchingFDR(const std::vector< PeptideIdentification > &pep_ids, double FDR, String score_name, bool higher_score_better)
Definition: DBSuitability.h:462
Size getIndexWithMedianNovoHits(const std::vector< DBSuitability::SuitabilityData > &data)
Definition: DBSuitability.h:457
void appendDecoys(std::vector< FASTAFile::FASTAEntry > &fasta)
Definition: DBSuitability.h:442
This class holds the functionality of calculating the database suitability.
Definition: DBSuitability.h:47
void compute(std::vector< PeptideIdentification > &&pep_ids, const MSExperiment &exp, const std::vector< FASTAFile::FASTAEntry > &original_fasta, const std::vector< FASTAFile::FASTAEntry > &novo_fasta, const ProteinIdentification::SearchParameters &search_params)
Computes suitability of a database used to search an mzML file.
std::vector< FASTAFile::FASTAEntry > getSubsampledFasta_(const std::vector< FASTAFile::FASTAEntry > &fasta_data, double subsampling_rate) const
Creates a subsampled fasta with the given subsampling rate.
bool checkScoreBetterThanThreshold_(const PeptideHit &hit, double threshold, bool higher_score_better) const
Tests if a PeptideHit has a score better than the given threshold.
Size getIndexWithMedianNovoHits_(const std::vector< SuitabilityData > &data) const
Finds the SuitabilityData object with the median number of de novo hits.
std::vector< PeptideIdentification > runIdentificationSearch_(const MSExperiment &exp, const std::vector< FASTAFile::FASTAEntry > &fasta_data, const String &adapter_name, Param &parameters) const
Executes the workflow from search adapter, followed by PeptideIndexer and finishes with FDR.
void writeIniFile_(const Param &parameters, const String &filename) const
Writes parameters into a given file.
std::pair< String, Param > extractSearchAdapterInfoFromMetaValues_(const ProteinIdentification::SearchParameters &meta_values) const
Looks through meta values of SearchParameters to find out which search adapter was used.
UInt numberOfUniqueProteins_(const std::vector< PeptideIdentification > &peps, UInt number_of_hits=1) const
Determines the number of unique proteins found in the protein accessions of PeptideIdentifications.
~DBSuitability() override=default
Destructor.
double getDecoyCutOff_(const std::vector< PeptideIdentification > &pep_ids, double reranking_cutoff_percentile) const
Calculates an xcorr cut-off based on decoy hits.
double calculateCorrectionFactor_(const SuitabilityData &data, const SuitabilityData &data_sampled, double sampling_rate) const
Calculates the correction factor from two suitability calculations.
const std::vector< SuitabilityData > & getResults() const
Returns results calculated by this metric.
std::vector< SuitabilityData > results_
result vector
Definition: DBSuitability.h:218
void appendDecoys_(std::vector< FASTAFile::FASTAEntry > &fasta) const
Calculates and appends decoys to a given vector of FASTAEntry.
void calculateSuitability_(const std::vector< PeptideIdentification > &pep_ids, SuitabilityData &data) const
Calculates all suitability data from a combined deNovo+database search.
double getScoreMatchingFDR_(const std::vector< PeptideIdentification > &pep_ids, double FDR, const String &score_name, bool higher_score_better) const
Extracts the worst score that still passes a FDR (q-value) threshold.
bool isNovoHit_(const PeptideHit &hit) const
Tests if a PeptideHit is considered a deNovo hit.
const boost::regex decoy_pattern_
pattern for finding a decoy string
Definition: DBSuitability.h:221
double extractScore_(const PeptideHit &pep_hit) const
Returns the cross correlation score normalized by MW (if existing), else if the 'force' flag is set t...
double getDecoyDiff_(const PeptideIdentification &pep_id) const
Calculates the xcorr difference between the top two hits marked as decoy.
A base class for all classes handling default parameters.
Definition: DefaultParamHandler.h:66
In-Memory representation of a mass spectrometry run.
Definition: MSExperiment.h:46
Management and storage of parameters / INI files.
Definition: Param.h:44
Representation of a peptide hit.
Definition: PeptideHit.h:31
Represents the peptide hits for a spectrum.
Definition: PeptideIdentification.h:39
A more convenient string class.
Definition: String.h:34
unsigned int UInt
Unsigned integer type.
Definition: Types.h:68
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:101
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:22
struct to store results
Definition: DBSuitability.h:51
SuitabilityData simulateNoReRanking() const
Returns a SuitabilityData object containing the data if re-ranking didn't happen.
Search parameters of the DB search.
Definition: ProteinIdentification.h:247