OpenMS  2.8.0
DBSuitability.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2021.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Tom Waschischeck $
32 // $Authors: Tom Waschischeck $
33 // --------------------------------------------------------------------------
34 
35 #pragma once
36 
37 #include <OpenMS/CONCEPT/Types.h>
41 
42 #include <cfloat>
43 #include <map>
44 #include <vector>
45 
46 namespace OpenMS
47 {
48  class ParamXMLFile;
49  class PeptideIdentification;
50  class PeptideHit;
51  class MSExperiment;
52 
/// This class holds the functionality of calculating the database suitability.
/// It derives publicly from DefaultParamHandler.
/// NOTE(review): this listing skips several numbered source lines (e.g. 124-133 and 154-159),
/// so declarations may be missing from view. The member index of this page mentions
/// `SuitabilityData simulateNoReRanking() const`, which is not declared in the visible
/// lines -- confirm against the real header.
70  class OPENMS_DLLAPI DBSuitability:
71  public DefaultParamHandler
72  {
73  public:
    /// struct to store results
75  struct OPENMS_DLLAPI SuitabilityData
76  {
    // Counters; each one starts at 0.
78  Size num_top_novo = 0;
79 
81  Size num_top_db = 0;
82 
84  Size num_interest = 0;
85 
88  Size num_re_ranked = 0;
89 
    // Cut-off value; starts at DBL_MAX until something assigns it.
92  double cut_off = DBL_MAX;
93 
    // Suitability values; each one starts at 0.
103  double suitability = 0;
104 
107  double suitability_no_rerank = 0;
108 
110  double suitability_corr_no_rerank = 0;
111 
112  // resets all members to their defaults
113  void clear();
114 
    // Setter for the correction factor.
    // presumably this also refreshes the corrected values below -- TODO confirm in the implementation
117  void setCorrectionFactor(double factor);
118 
    // Read-only accessors for the private, correction-related values.
119  double getCorrectionFactor() const;
120 
121  double getCorrectedNovoHits() const;
122 
123  double getCorrectedSuitability() const;
124 
134 
135  private:
    // NOTE(review): unlike every other data member of this struct, corr_factor has no
    // in-class initializer -- confirm it is always assigned before getCorrectionFactor() reads it.
141  double corr_factor;
142 
144  double num_top_novo_corr = 0;
145 
151  double suitability_corr = 0;
152  };
153 
158 
    /// Destructor.
160  ~DBSuitability() override = default;
161 
    /// Grants the test helper class access to the private member functions.
163  friend class DBSuitability_friend;
164 
    /// Computes suitability of a database used to search an mzML.
    /// @p pep_ids is taken by rvalue reference, so callers hand over their vector.
228  void compute(std::vector<PeptideIdentification>&& pep_ids, const MSExperiment& exp, const std::vector<FASTAFile::FASTAEntry>& original_fasta, const std::vector<FASTAFile::FASTAEntry>& novo_fasta, const ProteinIdentification::SearchParameters& search_params);
229 
    /// Returns results calculated by this metric.
239  const std::vector<SuitabilityData>& getResults() const;
240 
241  private:
    /// result vector
243  std::vector<SuitabilityData> results_;
244 
    /// pattern for finding a decoy string
246  const boost::regex decoy_pattern_;
247 
    /// Calculates the xcorr difference between the top two hits marked as decoy.
262  double getDecoyDiff_(const PeptideIdentification& pep_id) const;
263 
    /// Calculates an xcorr cut-off based on decoy hits.
278  double getDecoyCutOff_(const std::vector<PeptideIdentification>& pep_ids, double reranking_cutoff_percentile) const;
279 
    /// Tests if a PeptideHit is considered a deNovo hit.
293  bool isNovoHit_(const PeptideHit& hit) const;
294 
    /// Tests if a PeptideHit has a score better than the given threshold.
303  bool checkScoreBetterThanThreshold_(const PeptideHit& hit, double threshold, bool higher_score_better) const;
304 
    /// Looks through meta values of SearchParameters to find out which search adapter was used.
315  std::pair<String, Param> extractSearchAdapterInfoFromMetaValues_(const ProteinIdentification::SearchParameters& search_params) const;
316 
    /// Writes parameters into a given file.
324  void writeIniFile_(const Param& parameters, const String& filename) const;
325 
    /// Executes the workflow from search adapter, followed by PeptideIndexer and finishes with FDR.
348  std::vector<PeptideIdentification> runIdentificationSearch_(const MSExperiment& exp, const std::vector<FASTAFile::FASTAEntry>& fasta_data, const String& adapter_name, Param& parameters) const;
349 
    /// Creates a subsampled fasta with the given subsampling rate.
360  std::vector<FASTAFile::FASTAEntry> getSubsampledFasta_(const std::vector<FASTAFile::FASTAEntry>& fasta_data, double subsampling_rate) const;
361 
    /// Calculates all suitability data from a combined deNovo+database search.
    /// Results are written into @p data.
378  void calculateSuitability_(const std::vector<PeptideIdentification>& pep_ids, SuitabilityData& data) const;
379 
    /// Calculates and appends decoys to a given vector of FASTAEntry.
389  void appendDecoys_(std::vector<FASTAFile::FASTAEntry>& fasta) const;
390 
    /// Returns the cross correlation score normalized by MW (if existing).
    /// NOTE(review): the fallback used when that score does not exist is not visible here -- confirm in the implementation.
398  double extractScore_(const PeptideHit& pep_hit) const;
399 
    /// Calculates the correction factor from two suitability calculations.
413  double calculateCorrectionFactor_(const SuitabilityData& data, const SuitabilityData& data_sampled, double sampling_rate) const;
414 
    /// Determines the number of unique proteins found in the protein accessions of PeptideIdentifications.
423  UInt numberOfUniqueProteins_(const std::vector<PeptideIdentification>& peps, UInt number_of_hits = 1) const;
424 
    /// Finds the SuitabilityData object with the median number of de novo hits and returns its index.
433  Size getIndexWithMedianNovoHits_(const std::vector<SuitabilityData>& data) const;
434 
    /// Extracts the worst score that still passes an FDR (q-value) threshold.
451  double getScoreMatchingFDR_(const std::vector<PeptideIdentification>& pep_ids, double FDR, String score_name, bool higher_score_better) const;
452  };
453 
454  // friend class to test private member functions
// Each public wrapper below forwards its arguments unchanged to the equally named private
// DBSuitability member function (the one with a trailing underscore) on the `suit_` object.
// NOTE(review): the line that opens this class (source line 455) and the declaration of
// `suit_` (source line 504, listed in the member index of this page as `DBSuitability suit_`)
// are absent from this listing -- confirm against the real header.
456  {
457  public:
458  DBSuitability_friend() = default;
459 
461 
    // Forwards to DBSuitability::getSubsampledFasta_.
462  std::vector<FASTAFile::FASTAEntry> getSubsampledFasta(const std::vector<FASTAFile::FASTAEntry>& fasta_data, double subsampling_rate)
463  {
464  return suit_.getSubsampledFasta_(fasta_data, subsampling_rate);
465  }
466 
    // Forwards to DBSuitability::appendDecoys_; @p fasta is modified in place.
467  void appendDecoys(std::vector<FASTAFile::FASTAEntry>& fasta)
468  {
469  suit_.appendDecoys_(fasta);
470  }
471 
    // Forwards to DBSuitability::calculateCorrectionFactor_.
472  double calculateCorrectionFactor(const DBSuitability::SuitabilityData& data, const DBSuitability::SuitabilityData& data_sampled, double sampling_rate)
473  {
474  return suit_.calculateCorrectionFactor_(data, data_sampled, sampling_rate);
475  }
476 
    // Forwards to DBSuitability::numberOfUniqueProteins_; number_of_hits defaults to 1.
477  UInt numberOfUniqueProteins(const std::vector<PeptideIdentification>& peps, UInt number_of_hits = 1)
478  {
479  return suit_.numberOfUniqueProteins_(peps, number_of_hits);
480  }
481 
    // Forwards to DBSuitability::getIndexWithMedianNovoHits_.
482  Size getIndexWithMedianNovoHits(const std::vector<DBSuitability::SuitabilityData>& data)
483  {
484  return suit_.getIndexWithMedianNovoHits_(data);
485  }
486 
    // Forwards to DBSuitability::getScoreMatchingFDR_.
487  double getScoreMatchingFDR(const std::vector<PeptideIdentification>& pep_ids, double FDR, String score_name, bool higher_score_better)
488  {
489  return suit_.getScoreMatchingFDR_(pep_ids, FDR, score_name, higher_score_better);
490  }
491 
492  /* Not tested:
493  getDecoyDiff_, getDecoyCutOff_, isNovoHit_, checkScoreBetterThanThreshold_
494  Reason: These functions are essential to the normal suitability calculation and if something would not work, the test for 'compute' would fail.
495 
496  extractSearchAdapterInfoFromMetaValues_, writeIniFile_, extractScore_
497  Reason: These functions are very straightforward.
498 
499  runIdentificationSearch_
500  Reason: This function simulates a whole workflow and testing it would be too complicated.
501  */
502 
503  private:
505  };
506 }
507 
Definition: DBSuitability.h:456
std::vector< FASTAFile::FASTAEntry > getSubsampledFasta(const std::vector< FASTAFile::FASTAEntry > &fasta_data, double subsampling_rate)
Definition: DBSuitability.h:462
UInt numberOfUniqueProteins(const std::vector< PeptideIdentification > &peps, UInt number_of_hits=1)
Definition: DBSuitability.h:477
double calculateCorrectionFactor(const DBSuitability::SuitabilityData &data, const DBSuitability::SuitabilityData &data_sampled, double sampling_rate)
Definition: DBSuitability.h:472
DBSuitability suit_
Definition: DBSuitability.h:504
double getScoreMatchingFDR(const std::vector< PeptideIdentification > &pep_ids, double FDR, String score_name, bool higher_score_better)
Definition: DBSuitability.h:487
Size getIndexWithMedianNovoHits(const std::vector< DBSuitability::SuitabilityData > &data)
Definition: DBSuitability.h:482
void appendDecoys(std::vector< FASTAFile::FASTAEntry > &fasta)
Definition: DBSuitability.h:467
This class holds the functionality of calculating the database suitability.
Definition: DBSuitability.h:72
void compute(std::vector< PeptideIdentification > &&pep_ids, const MSExperiment &exp, const std::vector< FASTAFile::FASTAEntry > &original_fasta, const std::vector< FASTAFile::FASTAEntry > &novo_fasta, const ProteinIdentification::SearchParameters &search_params)
Computes suitability of a database used to search an mzML.
std::vector< FASTAFile::FASTAEntry > getSubsampledFasta_(const std::vector< FASTAFile::FASTAEntry > &fasta_data, double subsampling_rate) const
Creates a subsampled fasta with the given subsampling rate.
bool checkScoreBetterThanThreshold_(const PeptideHit &hit, double threshold, bool higher_score_better) const
Tests if a PeptideHit has a score better than the given threshold.
Size getIndexWithMedianNovoHits_(const std::vector< SuitabilityData > &data) const
Finds the SuitabilityData object with the median number of de novo hits.
std::vector< PeptideIdentification > runIdentificationSearch_(const MSExperiment &exp, const std::vector< FASTAFile::FASTAEntry > &fasta_data, const String &adapter_name, Param &parameters) const
Executes the workflow from search adapter, followed by PeptideIndexer and finishes with FDR.
void writeIniFile_(const Param &parameters, const String &filename) const
Writes parameters into a given file.
UInt numberOfUniqueProteins_(const std::vector< PeptideIdentification > &peps, UInt number_of_hits=1) const
Determines the number of unique proteins found in the protein accessions of PeptideIdentifications.
~DBSuitability() override=default
Destructor.
double getDecoyCutOff_(const std::vector< PeptideIdentification > &pep_ids, double reranking_cutoff_percentile) const
Calculates an xcorr cut-off based on decoy hits.
double calculateCorrectionFactor_(const SuitabilityData &data, const SuitabilityData &data_sampled, double sampling_rate) const
Calculates the correction factor from two suitability calculations.
double getScoreMatchingFDR_(const std::vector< PeptideIdentification > &pep_ids, double FDR, String score_name, bool higher_score_better) const
Extracts the worst score that still passes a FDR (q-value) threshold.
const std::vector< SuitabilityData > & getResults() const
Returns results calculated by this metric.
std::vector< SuitabilityData > results_
result vector
Definition: DBSuitability.h:243
void appendDecoys_(std::vector< FASTAFile::FASTAEntry > &fasta) const
Calculates and appends decoys to a given vector of FASTAEntry.
void calculateSuitability_(const std::vector< PeptideIdentification > &pep_ids, SuitabilityData &data) const
Calculates all suitability data from a combined deNovo+database search.
bool isNovoHit_(const PeptideHit &hit) const
Tests if a PeptideHit is considered a deNovo hit.
const boost::regex decoy_pattern_
pattern for finding a decoy string
Definition: DBSuitability.h:246
std::pair< String, Param > extractSearchAdapterInfoFromMetaValues_(const ProteinIdentification::SearchParameters &search_params) const
Looks through meta values of SearchParameters to find out which search adapter was used.
double extractScore_(const PeptideHit &pep_hit) const
Returns the cross correlation score normalized by MW (if existing), else if the 'force' flag is set t...
double getDecoyDiff_(const PeptideIdentification &pep_id) const
Calculates the xcorr difference between the top two hits marked as decoy.
A base class for all classes handling default parameters.
Definition: DefaultParamHandler.h:93
In-Memory representation of a mass spectrometry run.
Definition: MSExperiment.h:73
Management and storage of parameters / INI files.
Definition: Param.h:70
Representation of a peptide hit.
Definition: PeptideHit.h:57
Represents the peptide hits for a spectrum.
Definition: PeptideIdentification.h:65
A more convenient string class.
Definition: String.h:60
unsigned int UInt
Unsigned integer type.
Definition: Types.h:94
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:127
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:47
struct to store results
Definition: DBSuitability.h:76
SuitabilityData simulateNoReRanking() const
Returns a SuitabilityData object containing the data if re-ranking didn't happen.
double corr_factor
Definition: DBSuitability.h:141
Search parameters of the DB search.
Definition: ProteinIdentification.h:260